diff mbox

[RFC,2/2] iommu: rockchip: Handle system-wide and runtime PM

Message ID 1418286387-9663-3-git-send-email-tfiga@chromium.org (mailing list archive)
State New, archived
Headers show

Commit Message

Tomasz Figa Dec. 11, 2014, 8:26 a.m. UTC
This patch modifies the rockchip-iommu driver to consider state of
the power domain the IOMMU is located in. When the power domain
is powered off, the IOMMU cannot be accessed and register programming
must be deferred until the power domain becomes enabled.

The solution implemented in this patch uses power domain notifications
to perform necessary IOMMU initialization. Race with runtime PM core
is avoided by protecting code accessing the hardware, including startup
and shutdown functions, with a spinlock with a check for power state
inside.

Signed-off-by: Tomasz Figa <tfiga@chromium.org>
---
 drivers/iommu/rockchip-iommu.c | 208 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 172 insertions(+), 36 deletions(-)

Comments

Ulf Hansson Dec. 11, 2014, 11:58 a.m. UTC | #1
On 11 December 2014 at 09:26, Tomasz Figa <tfiga@chromium.org> wrote:
> This patch modifies the rockchip-iommu driver to consider state of
> the power domain the IOMMU is located in. When the power domain
> is powered off, the IOMMU cannot be accessed and register programming
> must be deferred until the power domain becomes enabled.
>
> The solution implemented in this patch uses power domain notifications
> to perform necessary IOMMU initialization. Race with runtime PM core
> is avoided by protecting code accessing the hardware, including startup
> and shutdown functions, with a spinlock with a check for power state
> inside.
>
> Signed-off-by: Tomasz Figa <tfiga@chromium.org>
> ---
>  drivers/iommu/rockchip-iommu.c | 208 ++++++++++++++++++++++++++++++++++-------
>  1 file changed, 172 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
> index b2023af..9120655 100644
> --- a/drivers/iommu/rockchip-iommu.c
> +++ b/drivers/iommu/rockchip-iommu.c
> @@ -20,6 +20,8 @@
>  #include <linux/of.h>
>  #include <linux/of_platform.h>
>  #include <linux/platform_device.h>
> +#include <linux/pm_domain.h>
> +#include <linux/pm_runtime.h>
>  #include <linux/slab.h>
>  #include <linux/spinlock.h>
>
> @@ -88,6 +90,9 @@ struct rk_iommu {
>         int irq;
>         struct list_head node; /* entry in rk_iommu_domain.iommus */
>         struct iommu_domain *domain; /* domain to which iommu is attached */
> +       struct notifier_block genpd_nb;
> +       spinlock_t hw_lock; /* lock for register access */
> +       bool is_powered; /* power domain is on and register clock is enabled */
>  };
>
>  static inline void rk_table_flush(u32 *va, unsigned int count)
> @@ -283,6 +288,9 @@ static void rk_iommu_zap_lines(struct rk_iommu *iommu, dma_addr_t iova,
>                                size_t size)
>  {
>         dma_addr_t iova_end = iova + size;
> +
> +       assert_spin_locked(&iommu->hw_lock);
> +
>         /*
>          * TODO(djkurtz): Figure out when it is more efficient to shootdown the
>          * entire iotlb rather than iterate over individual iovas.
> @@ -293,11 +301,15 @@ static void rk_iommu_zap_lines(struct rk_iommu *iommu, dma_addr_t iova,
>
>  static bool rk_iommu_is_stall_active(struct rk_iommu *iommu)
>  {
> +       assert_spin_locked(&iommu->hw_lock);
> +
>         return rk_iommu_read(iommu, RK_MMU_STATUS) & RK_MMU_STATUS_STALL_ACTIVE;
>  }
>
>  static bool rk_iommu_is_paging_enabled(struct rk_iommu *iommu)
>  {
> +       assert_spin_locked(&iommu->hw_lock);
> +
>         return rk_iommu_read(iommu, RK_MMU_STATUS) &
>                              RK_MMU_STATUS_PAGING_ENABLED;
>  }
> @@ -306,6 +318,8 @@ static int rk_iommu_enable_stall(struct rk_iommu *iommu)
>  {
>         int ret;
>
> +       assert_spin_locked(&iommu->hw_lock);
> +
>         if (rk_iommu_is_stall_active(iommu))
>                 return 0;
>
> @@ -327,6 +341,8 @@ static int rk_iommu_disable_stall(struct rk_iommu *iommu)
>  {
>         int ret;
>
> +       assert_spin_locked(&iommu->hw_lock);
> +
>         if (!rk_iommu_is_stall_active(iommu))
>                 return 0;
>
> @@ -344,6 +360,8 @@ static int rk_iommu_enable_paging(struct rk_iommu *iommu)
>  {
>         int ret;
>
> +       assert_spin_locked(&iommu->hw_lock);
> +
>         if (rk_iommu_is_paging_enabled(iommu))
>                 return 0;
>
> @@ -361,6 +379,8 @@ static int rk_iommu_disable_paging(struct rk_iommu *iommu)
>  {
>         int ret;
>
> +       assert_spin_locked(&iommu->hw_lock);
> +
>         if (!rk_iommu_is_paging_enabled(iommu))
>                 return 0;
>
> @@ -379,6 +399,8 @@ static int rk_iommu_force_reset(struct rk_iommu *iommu)
>         int ret;
>         u32 dte_addr;
>
> +       assert_spin_locked(&iommu->hw_lock);
> +
>         /*
>          * Check if register DTE_ADDR is working by writing DTE_ADDR_DUMMY
>          * and verifying that upper 5 nybbles are read back.
> @@ -401,6 +423,50 @@ static int rk_iommu_force_reset(struct rk_iommu *iommu)
>         return ret;
>  }
>
> +static int rk_iommu_startup(struct rk_iommu *iommu)
> +{
> +       struct iommu_domain *domain = iommu->domain;
> +       struct rk_iommu_domain *rk_domain;
> +       phys_addr_t dte_addr;
> +       int ret;
> +
> +       assert_spin_locked(&iommu->hw_lock);
> +
> +       ret = rk_iommu_enable_stall(iommu);
> +       if (ret)
> +               return ret;
> +
> +       ret = rk_iommu_force_reset(iommu);
> +       if (ret)
> +               return ret;
> +
> +       rk_domain = domain->priv;
> +       dte_addr = virt_to_phys(rk_domain->dt);
> +       rk_iommu_write(iommu, RK_MMU_DTE_ADDR, dte_addr);
> +       rk_iommu_command(iommu, RK_MMU_CMD_ZAP_CACHE);
> +       rk_iommu_write(iommu, RK_MMU_INT_MASK, RK_MMU_IRQ_MASK);
> +
> +       ret = rk_iommu_enable_paging(iommu);
> +       if (ret)
> +               return ret;
> +
> +       rk_iommu_disable_stall(iommu);
> +
> +       return ret;
> +}
> +
> +static void rk_iommu_shutdown(struct rk_iommu *iommu)
> +{
> +       assert_spin_locked(&iommu->hw_lock);
> +
> +       /* Ignore error while disabling, just keep going */
> +       rk_iommu_enable_stall(iommu);
> +       rk_iommu_disable_paging(iommu);
> +       rk_iommu_write(iommu, RK_MMU_INT_MASK, 0);
> +       rk_iommu_write(iommu, RK_MMU_DTE_ADDR, 0);
> +       rk_iommu_disable_stall(iommu);
> +}
> +
>  static void log_iova(struct rk_iommu *iommu, dma_addr_t iova)
>  {
>         u32 dte_index, pte_index, page_offset;
> @@ -414,6 +480,8 @@ static void log_iova(struct rk_iommu *iommu, dma_addr_t iova)
>         phys_addr_t page_addr_phys = 0;
>         u32 page_flags = 0;
>
> +       assert_spin_locked(&iommu->hw_lock);
> +
>         dte_index = rk_iova_dte_index(iova);
>         pte_index = rk_iova_pte_index(iova);
>         page_offset = rk_iova_page_offset(iova);
> @@ -450,13 +518,22 @@ print_it:
>  static irqreturn_t rk_iommu_irq(int irq, void *dev_id)
>  {
>         struct rk_iommu *iommu = dev_id;
> +       irqreturn_t ret = IRQ_HANDLED;
> +       unsigned long flags;
>         u32 status;
>         u32 int_status;
>         dma_addr_t iova;
>
> +       spin_lock_irqsave(&iommu->hw_lock, flags);
> +
> +       if (!iommu->is_powered)
> +               goto done;
> +
>         int_status = rk_iommu_read(iommu, RK_MMU_INT_STATUS);
> -       if (int_status == 0)
> -               return IRQ_NONE;
> +       if (int_status == 0) {
> +               ret = IRQ_NONE;
> +               goto done;
> +       }
>
>         iova = rk_iommu_read(iommu, RK_MMU_PAGE_FAULT_ADDR);
>
> @@ -497,7 +574,10 @@ static irqreturn_t rk_iommu_irq(int irq, void *dev_id)
>
>         rk_iommu_write(iommu, RK_MMU_INT_CLEAR, int_status);
>
> -       return IRQ_HANDLED;
> +done:
> +       spin_unlock_irqrestore(&iommu->hw_lock, flags);
> +
> +       return ret;
>  }
>
>  static phys_addr_t rk_iommu_iova_to_phys(struct iommu_domain *domain,
> @@ -537,9 +617,15 @@ static void rk_iommu_zap_iova(struct rk_iommu_domain *rk_domain,
>         /* shootdown these iova from all iommus using this domain */
>         spin_lock_irqsave(&rk_domain->iommus_lock, flags);
>         list_for_each(pos, &rk_domain->iommus) {
> -               struct rk_iommu *iommu;
> -               iommu = list_entry(pos, struct rk_iommu, node);
> -               rk_iommu_zap_lines(iommu, iova, size);
> +               struct rk_iommu *iommu = list_entry(pos, struct rk_iommu, node);
> +
> +               spin_lock(&iommu->hw_lock);
> +
> +               /* Only zap TLBs of IOMMUs that are powered on. */
> +               if (iommu->is_powered)
> +                       rk_iommu_zap_lines(iommu, iova, size);
> +
> +               spin_unlock(&iommu->hw_lock);
>         }
>         spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
>  }
> @@ -729,7 +815,6 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
>         struct rk_iommu_domain *rk_domain = domain->priv;
>         unsigned long flags;
>         int ret;
> -       phys_addr_t dte_addr;
>
>         /*
>          * Allow 'virtual devices' (e.g., drm) to attach to domain.
> @@ -739,37 +824,22 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
>         if (!iommu)
>                 return 0;
>
> -       ret = rk_iommu_enable_stall(iommu);
> -       if (ret)
> -               return ret;
> -
> -       ret = rk_iommu_force_reset(iommu);
> -       if (ret)
> -               return ret;
> -
> -       iommu->domain = domain;
> -
>         ret = devm_request_irq(dev, iommu->irq, rk_iommu_irq,
>                                IRQF_SHARED, dev_name(dev), iommu);
>         if (ret)
>                 return ret;
>
> -       dte_addr = virt_to_phys(rk_domain->dt);
> -       rk_iommu_write(iommu, RK_MMU_DTE_ADDR, dte_addr);
> -       rk_iommu_command(iommu, RK_MMU_CMD_ZAP_CACHE);
> -       rk_iommu_write(iommu, RK_MMU_INT_MASK, RK_MMU_IRQ_MASK);
> -
> -       ret = rk_iommu_enable_paging(iommu);
> -       if (ret)
> -               return ret;
> -
>         spin_lock_irqsave(&rk_domain->iommus_lock, flags);
>         list_add_tail(&iommu->node, &rk_domain->iommus);
>         spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
>
> -       dev_info(dev, "Attached to iommu domain\n");
> +       spin_lock_irqsave(&iommu->hw_lock, flags);
> +       iommu->domain = domain;
> +       if (iommu->is_powered)
> +               rk_iommu_startup(iommu);
> +       spin_unlock_irqrestore(&iommu->hw_lock, flags);
>
> -       rk_iommu_disable_stall(iommu);
> +       dev_info(dev, "Attached to iommu domain\n");
>
>         return 0;
>  }
> @@ -790,17 +860,14 @@ static void rk_iommu_detach_device(struct iommu_domain *domain,
>         list_del_init(&iommu->node);
>         spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
>
> -       /* Ignore error while disabling, just keep going */
> -       rk_iommu_enable_stall(iommu);
> -       rk_iommu_disable_paging(iommu);
> -       rk_iommu_write(iommu, RK_MMU_INT_MASK, 0);
> -       rk_iommu_write(iommu, RK_MMU_DTE_ADDR, 0);
> -       rk_iommu_disable_stall(iommu);
> +       spin_lock_irqsave(&iommu->hw_lock, flags);
> +       if (iommu->is_powered)
> +               rk_iommu_shutdown(iommu);
> +       iommu->domain = NULL;
> +       spin_unlock_irqrestore(&iommu->hw_lock, flags);
>
>         devm_free_irq(dev, iommu->irq, iommu);
>
> -       iommu->domain = NULL;
> -
>         dev_info(dev, "Detached from iommu domain\n");
>  }
>
> @@ -964,6 +1031,57 @@ static const struct iommu_ops rk_iommu_ops = {
>         .pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP,
>  };
>
> +static int rk_iommu_power_on(struct rk_iommu *iommu)
> +{
> +       unsigned long flags;
> +       int ret;
> +
> +       spin_lock_irqsave(&iommu->hw_lock, flags);
> +
> +       iommu->is_powered = true;
> +       if (iommu->domain)
> +               ret = rk_iommu_startup(iommu);
> +
> +       spin_unlock_irqrestore(&iommu->hw_lock, flags);
> +
> +       return ret;
> +}
> +
> +static void rk_iommu_power_off(struct rk_iommu *iommu)
> +{
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&iommu->hw_lock, flags);
> +
> +       if (iommu->domain)
> +               rk_iommu_shutdown(iommu);
> +       iommu->is_powered = false;
> +
> +       spin_unlock_irqrestore(&iommu->hw_lock, flags);
> +}
> +
> +static int rk_iommu_genpd_notify(struct notifier_block *nb,
> +                                unsigned long action, void *data)
> +{
> +       struct rk_iommu *iommu = container_of(nb, struct rk_iommu, genpd_nb);
> +       int ret = 0;
> +
> +       switch (action) {
> +       case PM_GENPD_POST_POWER_ON:
> +               ret = rk_iommu_power_on(iommu);
> +               break;
> +
> +       case PM_GENPD_POWER_OFF_PREPARE:
> +               rk_iommu_power_off(iommu);
> +               break;
> +
> +       default:
> +               return NOTIFY_DONE;
> +       }
> +
> +       return notifier_from_errno(ret);
> +}
> +
>  static int rk_iommu_probe(struct platform_device *pdev)
>  {
>         struct device *dev = &pdev->dev;
> @@ -976,6 +1094,7 @@ static int rk_iommu_probe(struct platform_device *pdev)
>
>         platform_set_drvdata(pdev, iommu);
>         iommu->dev = dev;
> +       spin_lock_init(&iommu->hw_lock);
>
>         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
>         iommu->base = devm_ioremap_resource(&pdev->dev, res);
> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct platform_device *pdev)
>                 return -ENXIO;
>         }
>
> +       pm_runtime_no_callbacks(dev);
> +       pm_runtime_enable(dev);
> +
> +       /* Synchronize state of the domain with driver data. */
> +       pm_runtime_get_sync(dev);
> +       iommu->is_powered = true;

Doesn't the runtime PM status reflect the value of "is_powered", thus
why do you need to have a copy of it? Could it perpahps be that you
try to cope with the case when CONFIG_PM is unset?

> +
> +       iommu->genpd_nb.notifier_call = rk_iommu_genpd_notify;
> +       pm_genpd_register_notifier(dev, &iommu->genpd_nb);
> +
> +       pm_runtime_put(dev);
> +
>         return 0;
>  }
>
>  static int rk_iommu_remove(struct platform_device *pdev)
>  {
> +       struct rk_iommu *iommu = platform_get_drvdata(pdev);
> +
> +       pm_genpd_unregister_notifier(iommu->dev, &iommu->genpd_nb);
> +       pm_runtime_disable(&pdev->dev);
> +
>         return 0;
>  }
>
> --
> 2.2.0.rc0.207.ga3a616c
>

Kind regards
Uffe
Tomasz Figa Dec. 11, 2014, 12:42 p.m. UTC | #2
Hi Ulf,

On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> On 11 December 2014 at 09:26, Tomasz Figa <tfiga@chromium.org> wrote:
>> This patch modifies the rockchip-iommu driver to consider state of
>> the power domain the IOMMU is located in. When the power domain
>> is powered off, the IOMMU cannot be accessed and register programming
>> must be deferred until the power domain becomes enabled.

[snip]

>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct platform_device *pdev)
>>                 return -ENXIO;
>>         }
>>
>> +       pm_runtime_no_callbacks(dev);
>> +       pm_runtime_enable(dev);
>> +
>> +       /* Synchronize state of the domain with driver data. */
>> +       pm_runtime_get_sync(dev);
>> +       iommu->is_powered = true;
>
> Doesn't the runtime PM status reflect the value of "is_powered", thus
> why do you need to have a copy of it? Could it perpahps be that you
> try to cope with the case when CONFIG_PM is unset?
>

It's worth noting that this driver fully relies on status of other
devices in the power domain the IOMMU is in and does not enforce the
status on its own. So in general, as far as my understanding of PM
runtime subsystem, the status of the IOMMU device will be always
suspended, because nobody will call pm_runtime_get() on it (except the
get and put pair in probe). So is_powered is here to track status of
the domain, not the device. Feel free to suggest a better way, though.

Best regards,
Tomasz
Ulf Hansson Dec. 11, 2014, 3:22 p.m. UTC | #3
On 11 December 2014 at 13:42, Tomasz Figa <tfiga@chromium.org> wrote:
> Hi Ulf,
>
> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson <ulf.hansson@linaro.org> wrote:
>> On 11 December 2014 at 09:26, Tomasz Figa <tfiga@chromium.org> wrote:
>>> This patch modifies the rockchip-iommu driver to consider state of
>>> the power domain the IOMMU is located in. When the power domain
>>> is powered off, the IOMMU cannot be accessed and register programming
>>> must be deferred until the power domain becomes enabled.
>
> [snip]
>
>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct platform_device *pdev)
>>>                 return -ENXIO;
>>>         }
>>>
>>> +       pm_runtime_no_callbacks(dev);
>>> +       pm_runtime_enable(dev);
>>> +
>>> +       /* Synchronize state of the domain with driver data. */
>>> +       pm_runtime_get_sync(dev);
>>> +       iommu->is_powered = true;
>>
>> Doesn't the runtime PM status reflect the value of "is_powered", thus
>> why do you need to have a copy of it? Could it perpahps be that you
>> try to cope with the case when CONFIG_PM is unset?
>>
>
> It's worth noting that this driver fully relies on status of other
> devices in the power domain the IOMMU is in and does not enforce the
> status on its own. So in general, as far as my understanding of PM
> runtime subsystem, the status of the IOMMU device will be always
> suspended, because nobody will call pm_runtime_get() on it (except the
> get and put pair in probe). So is_powered is here to track status of
> the domain, not the device. Feel free to suggest a better way, though.

I see, thanks for clarifying. I think it makes sense as is, I have no
better suggestion.

Kind regards
Uffe
Kevin Hilman Dec. 11, 2014, 3:31 p.m. UTC | #4
[+ Laurent Pinchart]

Tomasz Figa <tfiga@chromium.org> writes:

> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson <ulf.hansson@linaro.org> wrote:

[...]

>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct platform_device *pdev)
>>>                 return -ENXIO;
>>>         }
>>>
>>> +       pm_runtime_no_callbacks(dev);
>>> +       pm_runtime_enable(dev);
>>> +
>>> +       /* Synchronize state of the domain with driver data. */
>>> +       pm_runtime_get_sync(dev);
>>> +       iommu->is_powered = true;
>>
>> Doesn't the runtime PM status reflect the value of "is_powered", thus
>> why do you need to have a copy of it? Could it perpahps be that you
>> try to cope with the case when CONFIG_PM is unset?
>>
>
> It's worth noting that this driver fully relies on status of other
> devices in the power domain the IOMMU is in and does not enforce the
> status on its own. So in general, as far as my understanding of PM
> runtime subsystem, the status of the IOMMU device will be always
> suspended, because nobody will call pm_runtime_get() on it (except the
> get and put pair in probe). So is_powered is here to track status of
> the domain, not the device. Feel free to suggest a better way, though.

I still don't like these notifiers.  I think they add ways to bypass
having proper runtime PM implemented for devices/subsystems.

From a high-level, the IOMMU is just another device inside the PM
domain, so ideally it should be doing it's own _get() and _put() calls
so the PM domain code would just do the right thing without the need for
notifiers.

No knowing a lot about the IOMMU API, I'm guessing the reason you're not
doing that is because the IOMMU API currently doesn't have an easy way
to keep track of *active* users so it's not obvious where to put those
_get and _put calls.  If that doesn't exist, perhaps a simple
iommu_get() and iommu_put() API needs to be introduced (which inside the
IOMMU core would just do runtime PM calls) so that users of the IOMMU
could inform the subsystem that the IOMMU is needed and it should not be
powered off.

I Cc'd Laurent because I know he's thought about this before from the
IOMMU side, and not sure if he came up with a solution.

Kevin
Ulf Hansson Dec. 11, 2014, 3:51 p.m. UTC | #5
On 11 December 2014 at 16:31, Kevin Hilman <khilman@kernel.org> wrote:
> [+ Laurent Pinchart]
>
> Tomasz Figa <tfiga@chromium.org> writes:
>
>> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson <ulf.hansson@linaro.org> wrote:
>
> [...]
>
>>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct platform_device *pdev)
>>>>                 return -ENXIO;
>>>>         }
>>>>
>>>> +       pm_runtime_no_callbacks(dev);
>>>> +       pm_runtime_enable(dev);
>>>> +
>>>> +       /* Synchronize state of the domain with driver data. */
>>>> +       pm_runtime_get_sync(dev);
>>>> +       iommu->is_powered = true;
>>>
>>> Doesn't the runtime PM status reflect the value of "is_powered", thus
>>> why do you need to have a copy of it? Could it perpahps be that you
>>> try to cope with the case when CONFIG_PM is unset?
>>>
>>
>> It's worth noting that this driver fully relies on status of other
>> devices in the power domain the IOMMU is in and does not enforce the
>> status on its own. So in general, as far as my understanding of PM
>> runtime subsystem, the status of the IOMMU device will be always
>> suspended, because nobody will call pm_runtime_get() on it (except the
>> get and put pair in probe). So is_powered is here to track status of
>> the domain, not the device. Feel free to suggest a better way, though.
>
> I still don't like these notifiers.  I think they add ways to bypass
> having proper runtime PM implemented for devices/subsystems.

I do agree, but I haven't found another good solution to the problem.

>
> From a high-level, the IOMMU is just another device inside the PM
> domain, so ideally it should be doing it's own _get() and _put() calls
> so the PM domain code would just do the right thing without the need for
> notifiers.

As I understand it, the IOMMU (or for other similar cases) shouldn't
be doing any get() and put() at all because there are no IO API to
serve request from.

In principle we could consider these kind devices as "parent" devices
to those other devices that needs them. Then runtime PM core would
take care of things for us, right!?

Now, I am not so sure using the "parent" approach is actually viable,
since it will likely have other complications, but I haven't
thoroughly thought it though yet.

>
> No knowing a lot about the IOMMU API, I'm guessing the reason you're not
> doing that is because the IOMMU API currently doesn't have an easy way
> to keep track of *active* users so it's not obvious where to put those
> _get and _put calls.  If that doesn't exist, perhaps a simple
> iommu_get() and iommu_put() API needs to be introduced (which inside the
> IOMMU core would just do runtime PM calls) so that users of the IOMMU
> could inform the subsystem that the IOMMU is needed and it should not be
> powered off.
>
> I Cc'd Laurent because I know he's thought about this before from the
> IOMMU side, and not sure if he came up with a solution.

Cool, let's see then.

Kind regards
Uffe
Rafael J. Wysocki Dec. 11, 2014, 8:48 p.m. UTC | #6
On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
> On 11 December 2014 at 16:31, Kevin Hilman <khilman@kernel.org> wrote:
> > [+ Laurent Pinchart]
> >
> > Tomasz Figa <tfiga@chromium.org> writes:
> >
> >> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson <ulf.hansson@linaro.org> wrote:
> >
> > [...]
> >
> >>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct platform_device *pdev)
> >>>>                 return -ENXIO;
> >>>>         }
> >>>>
> >>>> +       pm_runtime_no_callbacks(dev);
> >>>> +       pm_runtime_enable(dev);
> >>>> +
> >>>> +       /* Synchronize state of the domain with driver data. */
> >>>> +       pm_runtime_get_sync(dev);
> >>>> +       iommu->is_powered = true;
> >>>
> >>> Doesn't the runtime PM status reflect the value of "is_powered", thus
> >>> why do you need to have a copy of it? Could it perpahps be that you
> >>> try to cope with the case when CONFIG_PM is unset?
> >>>
> >>
> >> It's worth noting that this driver fully relies on status of other
> >> devices in the power domain the IOMMU is in and does not enforce the
> >> status on its own. So in general, as far as my understanding of PM
> >> runtime subsystem, the status of the IOMMU device will be always
> >> suspended, because nobody will call pm_runtime_get() on it (except the
> >> get and put pair in probe). So is_powered is here to track status of
> >> the domain, not the device. Feel free to suggest a better way, though.
> >
> > I still don't like these notifiers.  I think they add ways to bypass
> > having proper runtime PM implemented for devices/subsystems.
> 
> I do agree, but I haven't found another good solution to the problem.

For the record, I'm not liking this mostly because it "fixes" a generic problem
in a way that's hidden in the genpd code and very indirect.

> > From a high-level, the IOMMU is just another device inside the PM
> > domain, so ideally it should be doing it's own _get() and _put() calls
> > so the PM domain code would just do the right thing without the need for
> > notifiers.
> 
> As I understand it, the IOMMU (or for other similar cases) shouldn't
> be doing any get() and put() at all because there are no IO API to
> serve request from.
> 
> In principle we could consider these kind devices as "parent" devices
> to those other devices that needs them. Then runtime PM core would
> take care of things for us, right!?
> 
> Now, I am not so sure using the "parent" approach is actually viable,
> since it will likely have other complications, but I haven't
> thoroughly thought it though yet.

That actually need not be a "parent".

What's needed in this case is to do a pm_runtime_get_sync() on a device
depended on every time a dependent device is runtime-resumed (and analogously
for suspending).

The core doesn't have a way to do that, but it looks like we'll need to add
it anyway for various reasons (ACPI _DEP is one of them as I mentioned some
time ago, but people dismissed it basically as not their problem).
Tomasz Figa Dec. 12, 2014, 4:15 a.m. UTC | #7
Hi Rafael,

On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
>> On 11 December 2014 at 16:31, Kevin Hilman <khilman@kernel.org> wrote:
>> > [+ Laurent Pinchart]
>> >
>> > Tomasz Figa <tfiga@chromium.org> writes:
>> >
>> >> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson <ulf.hansson@linaro.org> wrote:
>> >
>> > [...]
>> >
>> >>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct platform_device *pdev)
>> >>>>                 return -ENXIO;
>> >>>>         }
>> >>>>
>> >>>> +       pm_runtime_no_callbacks(dev);
>> >>>> +       pm_runtime_enable(dev);
>> >>>> +
>> >>>> +       /* Synchronize state of the domain with driver data. */
>> >>>> +       pm_runtime_get_sync(dev);
>> >>>> +       iommu->is_powered = true;
>> >>>
>> >>> Doesn't the runtime PM status reflect the value of "is_powered", thus
>> >>> why do you need to have a copy of it? Could it perpahps be that you
>> >>> try to cope with the case when CONFIG_PM is unset?
>> >>>
>> >>
>> >> It's worth noting that this driver fully relies on status of other
>> >> devices in the power domain the IOMMU is in and does not enforce the
>> >> status on its own. So in general, as far as my understanding of PM
>> >> runtime subsystem, the status of the IOMMU device will be always
>> >> suspended, because nobody will call pm_runtime_get() on it (except the
>> >> get and put pair in probe). So is_powered is here to track status of
>> >> the domain, not the device. Feel free to suggest a better way, though.
>> >
>> > I still don't like these notifiers.  I think they add ways to bypass
>> > having proper runtime PM implemented for devices/subsystems.
>>
>> I do agree, but I haven't found another good solution to the problem.
>
> For the record, I'm not liking this mostly because it "fixes" a generic problem
> in a way that's hidden in the genpd code and very indirect.

Well, that's true. This is indeed a generic problem of PM dependencies
between devices (other than those represented by parent-child
relation), which in fact doesn't have much to do with genpd, but
rather with those devices directly. It is just that genpd is the most
convenient location to solve this in current code and in a simple way.
In other words, I see this solution as a reasonable way to get the
problem solved quickly for now, so that we can start thinking about a
more elegant solution.

>
>> > From a high-level, the IOMMU is just another device inside the PM
>> > domain, so ideally it should be doing it's own _get() and _put() calls
>> > so the PM domain code would just do the right thing without the need for
>> > notifiers.
>>
>> As I understand it, the IOMMU (or for other similar cases) shouldn't
>> be doing any get() and put() at all because there are no IO API to
>> serve request from.
>>
>> In principle we could consider these kind devices as "parent" devices
>> to those other devices that needs them. Then runtime PM core would
>> take care of things for us, right!?
>>
>> Now, I am not so sure using the "parent" approach is actually viable,
>> since it will likely have other complications, but I haven't
>> thoroughly thought it though yet.
>
> That actually need not be a "parent".
>
> What's needed in this case is to do a pm_runtime_get_sync() on a device
> depended on every time a dependent device is runtime-resumed (and analogously
> for suspending).
>
> The core doesn't have a way to do that, but it looks like we'll need to add
> it anyway for various reasons (ACPI _DEP is one of them as I mentioned some
> time ago, but people dismissed it basically as not their problem).

Let me show you our exact use case.

We have a power domain, which contains an IOMMU and an IP block, which
can do bus transactions through that IOMMU. Of course the IP block is
not aware of the IOMMU, because this is just an integration detail and
on other platforms using the same IP block the IOMMU might not be
there. This implies that the driver for this IP block should not be
aware of the IOMMU either, which, on the buffer allocation and mapping
side, is achieved by DMA mapping subsystem. We would also want the
IOMMU to be fully transparent to that driver on PM side.

Now, for PM related details, they are located in the same power
domain, which means that whenever the power domain is turned off, the
CPU can't access their registers and all the hardware state is gone.
To make the case more interesting, there is no point in programming
the IOMMU unless the device using it is powered on. Similarly, there
is no point in powering the domain on to just access the IOMMU. This
implies that the power domain should be fully controlled by the
stand-alone IP block, while the peripheral IOMMU shouldn't affect its
state and its driver only respond to operations performed by driver of
that stand-alone IP block.

A solution like below could work for the above:

- There is a per-device list of peripheral devices, which need to be
powered on for this device to work.
- Whenever the IOMMU subsystem/driver binds an IOMMU to a device, it
adds the IOMMU device to the list of peripheral devices of that device
(and similarly for removal).
- A runtime PM operation on a device will also perform the same
operation on all its peripheral devices.

Another way would be to extend what the PM runtime core does with
parent-child relations to handle the whole list of peripheral devices
instead of just the parent.

Would this design sound somehow reasonable to you?

Best regards,
Tomasz
Kevin Hilman Dec. 12, 2014, 8:04 p.m. UTC | #8
Tomasz Figa <tfiga@chromium.org> writes:

[...]

> We have a power domain, which contains an IOMMU and an IP block, which
> can do bus transactions through that IOMMU. Of course the IP block is
> not aware of the IOMMU, because this is just an integration detail and
> on other platforms using the same IP block the IOMMU might not be
> there. This implies that the driver for this IP block should not be
> aware of the IOMMU either, which, on the buffer allocation and mapping
> side, is achieved by DMA mapping subsystem. We would also want the
> IOMMU to be fully transparent to that driver on PM side.

An even more exciting problem exists when a CPU is in the same domain as
other peripherals, those peripherals are all idle and the power domain
is gated. :)

> Now, for PM related details, they are located in the same power
> domain, which means that whenever the power domain is turned off, the
> CPU can't access their registers and all the hardware state is gone.
> To make the case more interesting, there is no point in programming
> the IOMMU unless the device using it is powered on. Similarly, there
> is no point in powering the domain on to just access the IOMMU. This
> implies that the power domain should be fully controlled by the
> stand-alone IP block, while the peripheral IOMMU shouldn't affect its
> state and its driver only respond to operations performed by driver of
> that stand-alone IP block.

Which implies that the IOMMU driver should have it's own set of
runtime_suspend/runtime_resume callbacks to properly save/restore state
as well.

> A solution like below could work for the above:
>
> - There is a per-device list of peripheral devices, which need to be
> powered on for this device to work.
> - Whenever the IOMMU subsystem/driver binds an IOMMU to a device, it
> adds the IOMMU device to the list of peripheral devices of that device
> (and similarly for removal).
> - A runtime PM operation on a device will also perform the same
> operation on all its peripheral devices.

Yes, that is exactly what we need.

I'd rather use the term "dependent" devices rather than peripheral
devices, but otherwise it sounds like the right approach to me.

Kevin

> Another way would be to extend what the PM runtime core does with
> parent-child relations to handle the whole list of peripheral devices
> instead of just the parent.
Laurent Pinchart Dec. 12, 2014, 8:47 p.m. UTC | #9
Hello,

On Friday 12 December 2014 13:15:51 Tomasz Figa wrote:
> On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki wrote:
> > On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
> >> On 11 December 2014 at 16:31, Kevin Hilman <khilman@kernel.org> wrote:
> >> > [+ Laurent Pinchart]
> >> > 
> >> > Tomasz Figa <tfiga@chromium.org> writes:
> >> >> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson wrote:
> >> > [...]
> >> > 
> >> >>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct
> >> >>>> platform_device *pdev)>> >>>> 
> >> >>>>                 return -ENXIO;
> >> >>>>         
> >> >>>>         }
> >> >>>> 
> >> >>>> +       pm_runtime_no_callbacks(dev);
> >> >>>> +       pm_runtime_enable(dev);
> >> >>>> +
> >> >>>> +       /* Synchronize state of the domain with driver data. */
> >> >>>> +       pm_runtime_get_sync(dev);
> >> >>>> +       iommu->is_powered = true;
> >> >>> 
> >> >>> Doesn't the runtime PM status reflect the value of "is_powered", thus
> >> >>> why do you need to have a copy of it? Could it perpahps be that you
> >> >>> try to cope with the case when CONFIG_PM is unset?
> >> >> 
> >> >> It's worth noting that this driver fully relies on status of other
> >> >> devices in the power domain the IOMMU is in and does not enforce the
> >> >> status on its own. So in general, as far as my understanding of PM
> >> >> runtime subsystem, the status of the IOMMU device will be always
> >> >> suspended, because nobody will call pm_runtime_get() on it (except the
> >> >> get and put pair in probe). So is_powered is here to track status of
> >> >> the domain, not the device. Feel free to suggest a better way, though.
> >> > 
> >> > I still don't like these notifiers.  I think they add ways to bypass
> >> > having proper runtime PM implemented for devices/subsystems.
> >> 
> >> I do agree, but I haven't found another good solution to the problem.
> > 
> > For the record, I'm not liking this mostly because it "fixes" a generic
> > problem in a way that's hidden in the genpd code and very indirect.
> 
> Well, that's true. This is indeed a generic problem of PM dependencies
> between devices (other than those represented by parent-child
> relation), which in fact doesn't have much to do with genpd, but
> rather with those devices directly. It is just that genpd is the most
> convenient location to solve this in current code and in a simple way.
> In other words, I see this solution as a reasonable way to get the
> problem solved quickly for now, so that we can start thinking about a
> more elegant solution.
> 
> >> > From a high-level, the IOMMU is just another device inside the PM
> >> > domain, so ideally it should be doing it's own _get() and _put() calls
> >> > so the PM domain code would just do the right thing without the need
> >> > for notifiers.
> >> 
> >> As I understand it, the IOMMU (or for other similar cases) shouldn't
> >> be doing any get() and put() at all because there are no IO API to
> >> serve request from.

Speaking purely from an IOMMU point of view that's not entirely tree. IOMMU 
drivers expose map and unmap operations, so they can track whether any memory 
is mapped. From a bus master point of view the IOMMU map and unmap operations 
are hidden by the DMA mapping API. The IOMMU can thus track the existence of 
mappings without any IOMMU awareness in the bus master driver.

If no mapping exist the IOMMU shouldn't receive any translation request. An 
IOMMU driver could thus call pm_runtime_get_sync() in the map handler when 
creating the first mapping, and pm_runtime_put() in the unmap handler when 
tearing the last mapping down.

One could argue that the IOMMU would end up being powered more often than 
strictly needed, as bus masters drivers, even when written properly, could 
keep mappings around at times they don't perform bus access. This is true, and 
that's an argument I've raised during the last kernel summit. The general 
response (including Linus Torvald's) was that micro-optimizing power 
management might not be worth it, and that measurements proving that the gain 
is worth it are required before introducing new APIs to solve the problem. I 
can't disagree with that argument.

> >> In principle we could consider these kind devices as "parent" devices
> >> to those other devices that needs them. Then runtime PM core would
> >> take care of things for us, right!?
> >> 
> >> Now, I am not so sure using the "parent" approach is actually viable,
> >> since it will likely have other complications, but I haven't
> >> thoroughly thought it though yet.
> > 
> > That actually need not be a "parent".
> > 
> > What's needed in this case is to do a pm_runtime_get_sync() on a device
> > depended on every time a dependent device is runtime-resumed (and
> > analogously for suspending).
> > 
> > The core doesn't have a way to do that, but it looks like we'll need to
> > add it anyway for various reasons (ACPI _DEP is one of them as I mentioned
> > some time ago, but people dismissed it basically as not their problem).
> 
> Let me show you our exact use case.
> 
> We have a power domain, which contains an IOMMU and an IP block, which
> can do bus transactions through that IOMMU. Of course the IP block is
> not aware of the IOMMU, because this is just an integration detail and
> on other platforms using the same IP block the IOMMU might not be
> there. This implies that the driver for this IP block should not be
> aware of the IOMMU either, which, on the buffer allocation and mapping
> side, is achieved by DMA mapping subsystem. We would also want the
> IOMMU to be fully transparent to that driver on PM side.
> 
> Now, for PM related details, they are located in the same power
> domain, which means that whenever the power domain is turned off, the
> CPU can't access their registers and all the hardware state is gone.
> To make the case more interesting, there is no point in programming
> the IOMMU unless the device using it is powered on. Similarly, there
> is no point in powering the domain on to just access the IOMMU. This
> implies that the power domain should be fully controlled by the
> stand-alone IP block, while the peripheral IOMMU shouldn't affect its
> state and its driver only respond to operations performed by driver of
> that stand-alone IP block.
> 
> A solution like below could work for the above:
> 
> - There is a per-device list of peripheral devices, which need to be
> powered on for this device to work.
> - Whenever the IOMMU subsystem/driver binds an IOMMU to a device, it
> adds the IOMMU device to the list of peripheral devices of that device
> (and similarly for removal).
> - A runtime PM operation on a device will also perform the same
> operation on all its peripheral devices.
> 
> Another way would be to extend what the PM runtime core does with
> parent-child relations to handle the whole list of peripheral devices
> instead of just the parent.
> 
> Would this design sound somehow reasonable to you?
Tomasz Figa Dec. 15, 2014, 2:32 a.m. UTC | #10
On Sat, Dec 13, 2014 at 5:04 AM, Kevin Hilman <khilman@kernel.org> wrote:
> Tomasz Figa <tfiga@chromium.org> writes:
>
> [...]
>
>> We have a power domain, which contains an IOMMU and an IP block, which
>> can do bus transactions through that IOMMU. Of course the IP block is
>> not aware of the IOMMU, because this is just an integration detail and
>> on other platforms using the same IP block the IOMMU might not be
>> there. This implies that the driver for this IP block should not be
>> aware of the IOMMU either, which, on the buffer allocation and mapping
>> side, is achieved by DMA mapping subsystem. We would also want the
>> IOMMU to be fully transparent to that driver on PM side.
>
> An even more exciting problem exists when a CPU is in the same domain as
> other peripherals, those peripherals are all idle and the power domain
> is gated. :)
>

Definitely an exciting problem. :)

Although I think it's quite opposite to ours, because the kernel knows
when the CPU is online and can simply get() the CPU device from one of
the running CPUs when onlining it and put() when offlining. The same
can't be said about IOMMUs, which are essentially bus interfaces for
other IP blocks, not really standalone devices (more than being
standalone IPs).

>> Now, for PM related details, they are located in the same power
>> domain, which means that whenever the power domain is turned off, the
>> CPU can't access their registers and all the hardware state is gone.
>> To make the case more interesting, there is no point in programming
>> the IOMMU unless the device using it is powered on. Similarly, there
>> is no point in powering the domain on to just access the IOMMU. This
>> implies that the power domain should be fully controlled by the
>> stand-alone IP block, while the peripheral IOMMU shouldn't affect its
>> state and its driver only respond to operations performed by driver of
>> that stand-alone IP block.
>
> Which implies that the IOMMU driver should have it's own set of
> runtime_suspend/runtime_resume callbacks to properly save/restore state
> as well.

Yes, but someone needs to be calling them. Right now when genpd is
used, put()ting the last runtime active device in a domain will
trigger domain power off and calling .suspend() callbacks, but
.resume() callback of each device will be called only on first get()
of this device. This means that get()ting e.g. an video codec device
will not cause the IOMMU's .resume() callback to be called.

>
>> A solution like below could work for the above:
>>
>> - There is a per-device list of peripheral devices, which need to be
>> powered on for this device to work.
>> - Whenever the IOMMU subsystem/driver binds an IOMMU to a device, it
>> adds the IOMMU device to the list of peripheral devices of that device
>> (and similarly for removal).
>> - A runtime PM operation on a device will also perform the same
>> operation on all its peripheral devices.
>
> Yes, that is exactly what we need.
>
> I'd rather use the term "dependent" devices rather than peripheral
> devices, but otherwise it sounds like the right approach to me.

I will leave the terminology to you and other people. I'm not good at
inventing names. :)

Best regards,
Tomasz
Tomasz Figa Dec. 15, 2014, 2:39 a.m. UTC | #11
On Sat, Dec 13, 2014 at 5:47 AM, Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
> Hello,
>
> On Friday 12 December 2014 13:15:51 Tomasz Figa wrote:
>> On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki wrote:
>> > On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
>> >> On 11 December 2014 at 16:31, Kevin Hilman <khilman@kernel.org> wrote:
>> >> > [+ Laurent Pinchart]
>> >> >
>> >> > Tomasz Figa <tfiga@chromium.org> writes:
>> >> >> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson wrote:
>> >> > [...]
>> >> >
>> >> >>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct
>> >> >>>> platform_device *pdev)>> >>>>
>> >> >>>>                 return -ENXIO;
>> >> >>>>
>> >> >>>>         }
>> >> >>>>
>> >> >>>> +       pm_runtime_no_callbacks(dev);
>> >> >>>> +       pm_runtime_enable(dev);
>> >> >>>> +
>> >> >>>> +       /* Synchronize state of the domain with driver data. */
>> >> >>>> +       pm_runtime_get_sync(dev);
>> >> >>>> +       iommu->is_powered = true;
>> >> >>>
>> >> >>> Doesn't the runtime PM status reflect the value of "is_powered", thus
>> >> >>> why do you need to have a copy of it? Could it perpahps be that you
>> >> >>> try to cope with the case when CONFIG_PM is unset?
>> >> >>
>> >> >> It's worth noting that this driver fully relies on status of other
>> >> >> devices in the power domain the IOMMU is in and does not enforce the
>> >> >> status on its own. So in general, as far as my understanding of PM
>> >> >> runtime subsystem, the status of the IOMMU device will be always
>> >> >> suspended, because nobody will call pm_runtime_get() on it (except the
>> >> >> get and put pair in probe). So is_powered is here to track status of
>> >> >> the domain, not the device. Feel free to suggest a better way, though.
>> >> >
>> >> > I still don't like these notifiers.  I think they add ways to bypass
>> >> > having proper runtime PM implemented for devices/subsystems.
>> >>
>> >> I do agree, but I haven't found another good solution to the problem.
>> >
>> > For the record, I'm not liking this mostly because it "fixes" a generic
>> > problem in a way that's hidden in the genpd code and very indirect.
>>
>> Well, that's true. This is indeed a generic problem of PM dependencies
>> between devices (other than those represented by parent-child
>> relation), which in fact doesn't have much to do with genpd, but
>> rather with those devices directly. It is just that genpd is the most
>> convenient location to solve this in current code and in a simple way.
>> In other words, I see this solution as a reasonable way to get the
>> problem solved quickly for now, so that we can start thinking about a
>> more elegant solution.
>>
>> >> > From a high-level, the IOMMU is just another device inside the PM
>> >> > domain, so ideally it should be doing it's own _get() and _put() calls
>> >> > so the PM domain code would just do the right thing without the need
>> >> > for notifiers.
>> >>
>> >> As I understand it, the IOMMU (or for other similar cases) shouldn't
>> >> be doing any get() and put() at all because there are no IO API to
>> >> serve request from.
>
> Speaking purely from an IOMMU point of view that's not entirely tree. IOMMU
> drivers expose map and unmap operations, so they can track whether any memory
> is mapped. From a bus master point of view the IOMMU map and unmap operations
> are hidden by the DMA mapping API. The IOMMU can thus track the existence of
> mappings without any IOMMU awareness in the bus master driver.
>
> If no mapping exist the IOMMU shouldn't receive any translation request. An
> IOMMU driver could thus call pm_runtime_get_sync() in the map handler when
> creating the first mapping, and pm_runtime_put() in the unmap handler when
> tearing the last mapping down.
>
> One could argue that the IOMMU would end up being powered more often than
> strictly needed, as bus masters drivers, even when written properly, could
> keep mappings around at times they don't perform bus access. This is true, and
> that's an argument I've raised during the last kernel summit. The general
> response (including Linus Torvald's) was that micro-optimizing power
> management might not be worth it, and that measurements proving that the gain
> is worth it are required before introducing new APIs to solve the problem. I
> can't disagree with that argument.
>

This would be a micro optimization if the IOMMU was located in its own
power domain. Unfortunately in most cases it is not, so keeping all
the devices in the domain powered on, because one of them have a
mapping created doesn't sound like a good idea.

Moreover, most of the drivers will keep the mapping for much longer
than one run cycle. Please take a look at V4L2's videobuf2 subsystem
(which I guess you are more familiar with than me;)), which will keep
MMAP buffers mapped in IOMMU address space for their whole lifetime. I
believe similar is the case for DRM drivers.

Best regards,
Tomasz
Geert Uytterhoeven Dec. 15, 2014, 8:35 a.m. UTC | #12
On Fri, Dec 12, 2014 at 9:04 PM, Kevin Hilman <khilman@kernel.org> wrote:
> An even more exciting problem exists when a CPU is in the same domain as
> other peripherals, those peripherals are all idle and the power domain
> is gated. :)

We do have pm_genpd_attach_cpuidle() and pm_genpd_name_attach_cpuidle()?

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
Kevin Hilman Dec. 15, 2014, 6:06 p.m. UTC | #13
Geert Uytterhoeven <geert@linux-m68k.org> writes:

> On Fri, Dec 12, 2014 at 9:04 PM, Kevin Hilman <khilman@kernel.org> wrote:
>> An even more exciting problem exists when a CPU is in the same domain as
>> other peripherals, those peripherals are all idle and the power domain
>> is gated. :)
>
> We do have pm_genpd_attach_cpuidle() and pm_genpd_name_attach_cpuidle()?

Exactly.  

And I think to solve this problem, we need to generalize the attaching
of dependent devices to a PM domain.

Kevin
Laurent Pinchart Dec. 15, 2014, 7:53 p.m. UTC | #14
Hi Tomasz,

On Monday 15 December 2014 11:39:01 Tomasz Figa wrote:
> On Sat, Dec 13, 2014 at 5:47 AM, Laurent Pinchart wrote:
> > On Friday 12 December 2014 13:15:51 Tomasz Figa wrote:
> >> On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki wrote:
> >>> On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
> >>>> On 11 December 2014 at 16:31, Kevin Hilman <khilman@kernel.org> wrote:
> >>>>> [+ Laurent Pinchart]
> >>>>> 
> >>>>> Tomasz Figa <tfiga@chromium.org> writes:
> >>>>>> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson wrote:
> >>>>> [...]
> >>>>> 
> >>>>>>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct
> >>>>>>>> platform_device *pdev)
> >>>>>>>>                 return -ENXIO;
> >>>>>>>>         }
> >>>>>>>> 
> >>>>>>>> +       pm_runtime_no_callbacks(dev);
> >>>>>>>> +       pm_runtime_enable(dev);
> >>>>>>>> +
> >>>>>>>> +       /* Synchronize state of the domain with driver data. */
> >>>>>>>> +       pm_runtime_get_sync(dev);
> >>>>>>>> +       iommu->is_powered = true;
> >>>>>>> 
> >>>>>>> Doesn't the runtime PM status reflect the value of "is_powered",
> >>>>>>> thus why do you need to have a copy of it? Could it perpahps be
> >>>>>>> that you try to cope with the case when CONFIG_PM is unset?
> >>>>>> 
> >>>>>> It's worth noting that this driver fully relies on status of other
> >>>>>> devices in the power domain the IOMMU is in and does not enforce
> >>>>>> the status on its own. So in general, as far as my understanding of
> >>>>>> PM runtime subsystem, the status of the IOMMU device will be always
> >>>>>> suspended, because nobody will call pm_runtime_get() on it (except
> >>>>>> the get and put pair in probe). So is_powered is here to track
> >>>>>> status of the domain, not the device. Feel free to suggest a better
> >>>>>> way, though.
> >>>>> 
> >>>>> I still don't like these notifiers.  I think they add ways to bypass
> >>>>> having proper runtime PM implemented for devices/subsystems.
> >>>> 
> >>>> I do agree, but I haven't found another good solution to the problem.
> >>> 
> >>> For the record, I'm not liking this mostly because it "fixes" a generic
> >>> problem in a way that's hidden in the genpd code and very indirect.
> >> 
> >> Well, that's true. This is indeed a generic problem of PM dependencies
> >> between devices (other than those represented by parent-child
> >> relation), which in fact doesn't have much to do with genpd, but
> >> rather with those devices directly. It is just that genpd is the most
> >> convenient location to solve this in current code and in a simple way.
> >> In other words, I see this solution as a reasonable way to get the
> >> problem solved quickly for now, so that we can start thinking about a
> >> more elegant solution.
> >> 
> >> >> > From a high-level, the IOMMU is just another device inside the PM
> >> >> > domain, so ideally it should be doing it's own _get() and _put()
> >> >> > calls so the PM domain code would just do the right thing without
> >> >> > the need for notifiers.
> >> >> 
> >> >> As I understand it, the IOMMU (or for other similar cases) shouldn't
> >> >> be doing any get() and put() at all because there are no IO API to
> >> >> serve request from.
> > 
> > Speaking purely from an IOMMU point of view that's not entirely tree.
> > IOMMU drivers expose map and unmap operations, so they can track whether
> > any memory is mapped. From a bus master point of view the IOMMU map and
> > unmap operations are hidden by the DMA mapping API. The IOMMU can thus
> > track the existence of mappings without any IOMMU awareness in the bus
> > master driver.
> > 
> > If no mapping exist the IOMMU shouldn't receive any translation request.
> > An IOMMU driver could thus call pm_runtime_get_sync() in the map handler
> > when creating the first mapping, and pm_runtime_put() in the unmap handler
> > when tearing the last mapping down.
> > 
> > One could argue that the IOMMU would end up being powered more often than
> > strictly needed, as bus masters drivers, even when written properly, could
> > keep mappings around at times they don't perform bus access. This is true,
> > and that's an argument I've raised during the last kernel summit. The
> > general response (including Linus Torvald's) was that micro-optimizing
> > power management might not be worth it, and that measurements proving
> > that the gain is worth it are required before introducing new APIs to
> > solve the problem. I can't disagree with that argument.
> 
> This would be a micro optimization if the IOMMU was located in its own
> power domain. Unfortunately in most cases it is not, so keeping all
> the devices in the domain powered on, because one of them have a
> mapping created doesn't sound like a good idea.
> 
> Moreover, most of the drivers will keep the mapping for much longer
> than one run cycle. Please take a look at V4L2's videobuf2 subsystem
> (which I guess you are more familiar with than me;)), which will keep
> MMAP buffers mapped in IOMMU address space for their whole lifetime. I
> believe similar is the case for DRM drivers.

Yes, but that doesn't mean it gets out of control. Buffers shouldn't be 
allocated if they won't be used. Granted, they could be preallocated (or 
rather premapped) slightly before being used, but in sane use cases that 
shouldn't be long before the hardware needs to be turned on.
Tomasz Figa Dec. 16, 2014, 2:18 a.m. UTC | #15
On Tue, Dec 16, 2014 at 4:53 AM, Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
> Hi Tomasz,
>
> On Monday 15 December 2014 11:39:01 Tomasz Figa wrote:
>> On Sat, Dec 13, 2014 at 5:47 AM, Laurent Pinchart wrote:
>> > On Friday 12 December 2014 13:15:51 Tomasz Figa wrote:
>> >> On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki wrote:
>> >>> On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
>> >>>> On 11 December 2014 at 16:31, Kevin Hilman <khilman@kernel.org> wrote:
>> >>>>> [+ Laurent Pinchart]
>> >>>>>
>> >>>>> Tomasz Figa <tfiga@chromium.org> writes:
>> >>>>>> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson wrote:
>> >>>>> [...]
>> >>>>>
>> >>>>>>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct
>> >>>>>>>> platform_device *pdev)
>> >>>>>>>>                 return -ENXIO;
>> >>>>>>>>         }
>> >>>>>>>>
>> >>>>>>>> +       pm_runtime_no_callbacks(dev);
>> >>>>>>>> +       pm_runtime_enable(dev);
>> >>>>>>>> +
>> >>>>>>>> +       /* Synchronize state of the domain with driver data. */
>> >>>>>>>> +       pm_runtime_get_sync(dev);
>> >>>>>>>> +       iommu->is_powered = true;
>> >>>>>>>
>> >>>>>>> Doesn't the runtime PM status reflect the value of "is_powered",
>> >>>>>>> thus why do you need to have a copy of it? Could it perpahps be
>> >>>>>>> that you try to cope with the case when CONFIG_PM is unset?
>> >>>>>>
>> >>>>>> It's worth noting that this driver fully relies on status of other
>> >>>>>> devices in the power domain the IOMMU is in and does not enforce
>> >>>>>> the status on its own. So in general, as far as my understanding of
>> >>>>>> PM runtime subsystem, the status of the IOMMU device will be always
>> >>>>>> suspended, because nobody will call pm_runtime_get() on it (except
>> >>>>>> the get and put pair in probe). So is_powered is here to track
>> >>>>>> status of the domain, not the device. Feel free to suggest a better
>> >>>>>> way, though.
>> >>>>>
>> >>>>> I still don't like these notifiers.  I think they add ways to bypass
>> >>>>> having proper runtime PM implemented for devices/subsystems.
>> >>>>
>> >>>> I do agree, but I haven't found another good solution to the problem.
>> >>>
>> >>> For the record, I'm not liking this mostly because it "fixes" a generic
>> >>> problem in a way that's hidden in the genpd code and very indirect.
>> >>
>> >> Well, that's true. This is indeed a generic problem of PM dependencies
>> >> between devices (other than those represented by parent-child
>> >> relation), which in fact doesn't have much to do with genpd, but
>> >> rather with those devices directly. It is just that genpd is the most
>> >> convenient location to solve this in current code and in a simple way.
>> >> In other words, I see this solution as a reasonable way to get the
>> >> problem solved quickly for now, so that we can start thinking about a
>> >> more elegant solution.
>> >>
>> >> >> > From a high-level, the IOMMU is just another device inside the PM
>> >> >> > domain, so ideally it should be doing it's own _get() and _put()
>> >> >> > calls so the PM domain code would just do the right thing without
>> >> >> > the need for notifiers.
>> >> >>
>> >> >> As I understand it, the IOMMU (or for other similar cases) shouldn't
>> >> >> be doing any get() and put() at all because there are no IO API to
>> >> >> serve request from.
>> >
>> > Speaking purely from an IOMMU point of view that's not entirely tree.
>> > IOMMU drivers expose map and unmap operations, so they can track whether
>> > any memory is mapped. From a bus master point of view the IOMMU map and
>> > unmap operations are hidden by the DMA mapping API. The IOMMU can thus
>> > track the existence of mappings without any IOMMU awareness in the bus
>> > master driver.
>> >
>> > If no mapping exist the IOMMU shouldn't receive any translation request.
>> > An IOMMU driver could thus call pm_runtime_get_sync() in the map handler
>> > when creating the first mapping, and pm_runtime_put() in the unmap handler
>> > when tearing the last mapping down.
>> >
>> > One could argue that the IOMMU would end up being powered more often than
>> > strictly needed, as bus masters drivers, even when written properly, could
>> > keep mappings around at times they don't perform bus access. This is true,
>> > and that's an argument I've raised during the last kernel summit. The
>> > general response (including Linus Torvald's) was that micro-optimizing
>> > power management might not be worth it, and that measurements proving
>> > that the gain is worth it are required before introducing new APIs to
>> > solve the problem. I can't disagree with that argument.
>>
>> This would be a micro optimization if the IOMMU was located in its own
>> power domain. Unfortunately in most cases it is not, so keeping all
>> the devices in the domain powered on, because one of them have a
>> mapping created doesn't sound like a good idea.
>>
>> Moreover, most of the drivers will keep the mapping for much longer
>> than one run cycle. Please take a look at V4L2's videobuf2 subsystem
>> (which I guess you are more familiar with than me;)), which will keep
>> MMAP buffers mapped in IOMMU address space for their whole lifetime. I
>> believe similar is the case for DRM drivers.
>
> Yes, but that doesn't mean it gets out of control. Buffers shouldn't be
> allocated if they won't be used. Granted, they could be preallocated (or
> rather premapped) slightly before being used, but in sane use cases that
> shouldn't be long before the hardware needs to be turned on.

Assuming that we don't have a third party, called "user", involved.

A simple use case is video playback pause. Usually all the software
state (including output buffers that can be used as reference for
decoding next frames) needs to be preserved to continue decoding after
resume, but it would be nice to power off the decoder, if it is unused
for some period. In addition, we would like the pause/resume operation
to be fast, so unmapping/freeing buffers and then exactly the opposite
on resume doesn't sound like a good idea.

Best regards,
Tomasz
Laurent Pinchart Dec. 17, 2014, 12:15 a.m. UTC | #16
Hi Tomasz,

On Tuesday 16 December 2014 11:18:33 Tomasz Figa wrote:
> On Tue, Dec 16, 2014 at 4:53 AM, Laurent Pinchart wrote:
> > On Monday 15 December 2014 11:39:01 Tomasz Figa wrote:
> >> On Sat, Dec 13, 2014 at 5:47 AM, Laurent Pinchart wrote:
> >>> On Friday 12 December 2014 13:15:51 Tomasz Figa wrote:
> >>>> On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki wrote:
> >>>>> On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
> >>>>>> On 11 December 2014 at 16:31, Kevin Hilman wrote:
> >>>>>>> [+ Laurent Pinchart]
> >>>>>>> 
> >>>>>>> Tomasz Figa <tfiga@chromium.org> writes:
> >>>>>>>> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson wrote:
> >>>>>>> [...]
> >>>>>>> 
> >>>>>>>>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct
> >>>>>>>>>> platform_device *pdev)
> >>>>>>>>>>                 return -ENXIO;
> >>>>>>>>>>         }
> >>>>>>>>>> 
> >>>>>>>>>> +       pm_runtime_no_callbacks(dev);
> >>>>>>>>>> +       pm_runtime_enable(dev);
> >>>>>>>>>> +
> >>>>>>>>>> +       /* Synchronize state of the domain with driver data. */
> >>>>>>>>>> +       pm_runtime_get_sync(dev);
> >>>>>>>>>> +       iommu->is_powered = true;
> >>>>>>>>> 
> >>>>>>>>> Doesn't the runtime PM status reflect the value of "is_powered",
> >>>>>>>>> thus why do you need to have a copy of it? Could it perpahps be
> >>>>>>>>> that you try to cope with the case when CONFIG_PM is unset?
> >>>>>>>> 
> >>>>>>>> It's worth noting that this driver fully relies on status of other
> >>>>>>>> devices in the power domain the IOMMU is in and does not enforce
> >>>>>>>> the status on its own. So in general, as far as my understanding
> >>>>>>>> of PM runtime subsystem, the status of the IOMMU device will be
> >>>>>>>> always suspended, because nobody will call pm_runtime_get() on it
> >>>>>>>> (except the get and put pair in probe). So is_powered is here to
> >>>>>>>> track status of the domain, not the device. Feel free to suggest a
> >>>>>>>> better way, though.
> >>>>>>> 
> >>>>>>> I still don't like these notifiers.  I think they add ways to
> >>>>>>> bypass having proper runtime PM implemented for devices/subsystems.
> >>>>>> 
> >>>>>> I do agree, but I haven't found another good solution to the
> >>>>>> problem.
> >>>>> 
> >>>>> For the record, I'm not liking this mostly because it "fixes" a
> >>>>> generic problem in a way that's hidden in the genpd code and very
> >>>>> indirect.
> >>>> 
> >>>> Well, that's true. This is indeed a generic problem of PM dependencies
> >>>> between devices (other than those represented by parent-child
> >>>> relation), which in fact doesn't have much to do with genpd, but
> >>>> rather with those devices directly. It is just that genpd is the most
> >>>> convenient location to solve this in current code and in a simple way.
> >>>> In other words, I see this solution as a reasonable way to get the
> >>>> problem solved quickly for now, so that we can start thinking about a
> >>>> more elegant solution.
> >>>> 
> >>>>>>> From a high-level, the IOMMU is just another device inside the PM
> >>>>>>> domain, so ideally it should be doing it's own _get() and _put()
> >>>>>>> calls so the PM domain code would just do the right thing without
> >>>>>>> the need for notifiers.
> >>>>>> 
> >>>>>> As I understand it, the IOMMU (or for other similar cases)
> >>>>>> shouldn't be doing any get() and put() at all because there are no
> >>>>>> IO API to serve request from.
> >>> 
> >>> Speaking purely from an IOMMU point of view that's not entirely tree.
> >>> IOMMU drivers expose map and unmap operations, so they can track
> >>> whether any memory is mapped. From a bus master point of view the IOMMU
> >>> map and unmap operations are hidden by the DMA mapping API. The IOMMU
> >>> can thus track the existence of mappings without any IOMMU awareness in
> >>> the bus master driver.
> >>> 
> >>> If no mapping exist the IOMMU shouldn't receive any translation
> >>> request. An IOMMU driver could thus call pm_runtime_get_sync() in the
> >>> map handler when creating the first mapping, and pm_runtime_put() in
> >>> the unmap handler when tearing the last mapping down.
> >>> 
> >>> One could argue that the IOMMU would end up being powered more often
> >>> than strictly needed, as bus masters drivers, even when written
> >>> properly, could keep mappings around at times they don't perform bus
> >>> access. This is true, and that's an argument I've raised during the
> >>> last kernel summit. The general response (including Linus Torvald's)
> >>> was that micro-optimizing power management might not be worth it, and
> >>> that measurements proving that the gain is worth it are required before
> >>> introducing new APIs to solve the problem. I can't disagree with that
> >>> argument.
> >> 
> >> This would be a micro optimization if the IOMMU was located in its own
> >> power domain. Unfortunately in most cases it is not, so keeping all
> >> the devices in the domain powered on, because one of them have a
> >> mapping created doesn't sound like a good idea.
> >> 
> >> Moreover, most of the drivers will keep the mapping for much longer
> >> than one run cycle. Please take a look at V4L2's videobuf2 subsystem
> >> (which I guess you are more familiar with than me;)), which will keep
> >> MMAP buffers mapped in IOMMU address space for their whole lifetime. I
> >> believe similar is the case for DRM drivers.
> > 
> > Yes, but that doesn't mean it gets out of control. Buffers shouldn't be
> > allocated if they won't be used. Granted, they could be preallocated (or
> > rather premapped) slightly before being used, but in sane use cases that
> > shouldn't be long before the hardware needs to be turned on.
> 
> Assuming that we don't have a third party, called "user", involved.

Who needs that ? :-D

> A simple use case is video playback pause. Usually all the software state
> (including output buffers that can be used as reference for decoding next
> frames) needs to be preserved to continue decoding after resume, but it
> would be nice to power off the decoder, if it is unused for some period. In
> addition, we would like the pause/resume operation to be fast, so
> unmapping/freeing buffers and then exactly the opposite on resume doesn't
> sound like a good idea.

OK, then we have one possible use case. I expect people to still want to see 
power consumption numbers though.

You can call me annoying, but I'm not sure whether a generic PM dependency 
implementation, while it could be a good idea in general, is the best solution 
here, especially if the bus master and the IOMMU are in a different power 
domain. The bus master could provide functions that don't require DMA access. 
For instance a camera controller could feed its output to the display 
directly, without going through memory. In that case we probably don't want to 
power the IOMMU and its complete power domain on when using the camera 
controller in that mode.

One alternative solution would be to extend the DMA mapping API with two 
functions to signal that DMA is about to be started and that DMA has now 
finished. It might be considered too ad-hoc though.
Rafael J. Wysocki Dec. 18, 2014, 1:32 a.m. UTC | #17
On Wednesday, December 17, 2014 02:15:31 AM Laurent Pinchart wrote:
> Hi Tomasz,
> 
> On Tuesday 16 December 2014 11:18:33 Tomasz Figa wrote:
> > On Tue, Dec 16, 2014 at 4:53 AM, Laurent Pinchart wrote:
> > > On Monday 15 December 2014 11:39:01 Tomasz Figa wrote:
> > >> On Sat, Dec 13, 2014 at 5:47 AM, Laurent Pinchart wrote:
> > >>> On Friday 12 December 2014 13:15:51 Tomasz Figa wrote:
> > >>>> On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki wrote:
> > >>>>> On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
> > >>>>>> On 11 December 2014 at 16:31, Kevin Hilman wrote:
> > >>>>>>> [+ Laurent Pinchart]
> > >>>>>>> 
> > >>>>>>> Tomasz Figa <tfiga@chromium.org> writes:
> > >>>>>>>> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson wrote:
> > >>>>>>> [...]
> > >>>>>>> 
> > >>>>>>>>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct
> > >>>>>>>>>> platform_device *pdev)
> > >>>>>>>>>>                 return -ENXIO;
> > >>>>>>>>>>         }
> > >>>>>>>>>> 
> > >>>>>>>>>> +       pm_runtime_no_callbacks(dev);
> > >>>>>>>>>> +       pm_runtime_enable(dev);
> > >>>>>>>>>> +
> > >>>>>>>>>> +       /* Synchronize state of the domain with driver data. */
> > >>>>>>>>>> +       pm_runtime_get_sync(dev);
> > >>>>>>>>>> +       iommu->is_powered = true;
> > >>>>>>>>> 
> > >>>>>>>>> Doesn't the runtime PM status reflect the value of "is_powered",
> > >>>>>>>>> thus why do you need to have a copy of it? Could it perpahps be
> > >>>>>>>>> that you try to cope with the case when CONFIG_PM is unset?
> > >>>>>>>> 
> > >>>>>>>> It's worth noting that this driver fully relies on status of other
> > >>>>>>>> devices in the power domain the IOMMU is in and does not enforce
> > >>>>>>>> the status on its own. So in general, as far as my understanding
> > >>>>>>>> of PM runtime subsystem, the status of the IOMMU device will be
> > >>>>>>>> always suspended, because nobody will call pm_runtime_get() on it
> > >>>>>>>> (except the get and put pair in probe). So is_powered is here to
> > >>>>>>>> track status of the domain, not the device. Feel free to suggest a
> > >>>>>>>> better way, though.
> > >>>>>>> 
> > >>>>>>> I still don't like these notifiers.  I think they add ways to
> > >>>>>>> bypass having proper runtime PM implemented for devices/subsystems.
> > >>>>>> 
> > >>>>>> I do agree, but I haven't found another good solution to the
> > >>>>>> problem.
> > >>>>> 
> > >>>>> For the record, I'm not liking this mostly because it "fixes" a
> > >>>>> generic problem in a way that's hidden in the genpd code and very
> > >>>>> indirect.
> > >>>> 
> > >>>> Well, that's true. This is indeed a generic problem of PM dependencies
> > >>>> between devices (other than those represented by parent-child
> > >>>> relation), which in fact doesn't have much to do with genpd, but
> > >>>> rather with those devices directly. It is just that genpd is the most
> > >>>> convenient location to solve this in current code and in a simple way.
> > >>>> In other words, I see this solution as a reasonable way to get the
> > >>>> problem solved quickly for now, so that we can start thinking about a
> > >>>> more elegant solution.
> > >>>> 
> > >>>>>>> From a high-level, the IOMMU is just another device inside the PM
> > >>>>>>> domain, so ideally it should be doing it's own _get() and _put()
> > >>>>>>> calls so the PM domain code would just do the right thing without
> > >>>>>>> the need for notifiers.
> > >>>>>> 
> > >>>>>> As I understand it, the IOMMU (or for other similar cases)
> > >>>>>> shouldn't be doing any get() and put() at all because there are no
> > >>>>>> IO API to serve request from.
> > >>> 
> > >>> Speaking purely from an IOMMU point of view that's not entirely tree.
> > >>> IOMMU drivers expose map and unmap operations, so they can track
> > >>> whether any memory is mapped. From a bus master point of view the IOMMU
> > >>> map and unmap operations are hidden by the DMA mapping API. The IOMMU
> > >>> can thus track the existence of mappings without any IOMMU awareness in
> > >>> the bus master driver.
> > >>> 
> > >>> If no mapping exist the IOMMU shouldn't receive any translation
> > >>> request. An IOMMU driver could thus call pm_runtime_get_sync() in the
> > >>> map handler when creating the first mapping, and pm_runtime_put() in
> > >>> the unmap handler when tearing the last mapping down.
> > >>> 
> > >>> One could argue that the IOMMU would end up being powered more often
> > >>> than strictly needed, as bus masters drivers, even when written
> > >>> properly, could keep mappings around at times they don't perform bus
> > >>> access. This is true, and that's an argument I've raised during the
> > >>> last kernel summit. The general response (including Linus Torvald's)
> > >>> was that micro-optimizing power management might not be worth it, and
> > >>> that measurements proving that the gain is worth it are required before
> > >>> introducing new APIs to solve the problem. I can't disagree with that
> > >>> argument.
> > >> 
> > >> This would be a micro optimization if the IOMMU was located in its own
> > >> power domain. Unfortunately in most cases it is not, so keeping all
> > >> the devices in the domain powered on, because one of them have a
> > >> mapping created doesn't sound like a good idea.
> > >> 
> > >> Moreover, most of the drivers will keep the mapping for much longer
> > >> than one run cycle. Please take a look at V4L2's videobuf2 subsystem
> > >> (which I guess you are more familiar with than me;)), which will keep
> > >> MMAP buffers mapped in IOMMU address space for their whole lifetime. I
> > >> believe similar is the case for DRM drivers.
> > > 
> > > Yes, but that doesn't mean it gets out of control. Buffers shouldn't be
> > > allocated if they won't be used. Granted, they could be preallocated (or
> > > rather premapped) slightly before being used, but in sane use cases that
> > > shouldn't be long before the hardware needs to be turned on.
> > 
> > Assuming that we don't have a third party, called "user", involved.
> 
> Who needs that ? :-D
> 
> > A simple use case is video playback pause. Usually all the software state
> > (including output buffers that can be used as reference for decoding next
> > frames) needs to be preserved to continue decoding after resume, but it
> > would be nice to power off the decoder, if it is unused for some period. In
> > addition, we would like the pause/resume operation to be fast, so
> > unmapping/freeing buffers and then exactly the opposite on resume doesn't
> > sound like a good idea.
> 
> OK, then we have one possible use case. I expect people to still want to see 
> power consumption numbers though.

Well, we have them, kind of.

In the ACPI world there's something called _DEP which gives us a list of devices
depended on by the given one.  Those may be devices whose drivers provide so
called "operation region" handling which means that an ACPI method executed for
the dependent device may access a device it depends on indirectly.  Because of
that indirection we basically need the devices listed by _DEP to be "on" whenever
the dependent device is "on" or things may break in nasty ways otherwise.

Now, on (some) Intel SoCs some devices listed by _DEP cannot be "on" all the
time, because the lowest-power states of the whole SoC cannot be used then,
which makes hours of battery life of a difference.

This isn't exactly the same problem, but it maps to the IOMMU one quite well IMO.

> You can call me annoying, but I'm not sure whether a generic PM dependency 
> implementation, while it could be a good idea in general, is the best solution 
> here, especially if the bus master and the IOMMU are in a different power 
> domain. The bus master could provide functions that don't require DMA access. 
> For instance a camera controller could feed its output to the display 
> directly, without going through memory. In that case we probably don't want to 
> power the IOMMU and its complete power domain on when using the camera 
> controller in that mode.

That's a fair point, but it really boils down to energy usage numbers again.

> One alternative solution would be to extend the DMA mapping API with two 
> functions to signal that DMA is about to be started and that DMA has now 
> finished. It might be considered too ad-hoc though.

It would be better to be able to reference count the DMA engine from the
bus master IMO and arguably you can use the runtime PM framework for that.
Namely, give bus masters someting like

	pm_runtime_get_my_DMA_engine(bus_master_device)
	pm_runtime_put_my_DMA_engine(bus_master_device)

and let them call these as they see fit.
Laurent Pinchart Dec. 18, 2014, 7:12 p.m. UTC | #18
Hi Rafael,

On Thursday 18 December 2014 02:32:30 Rafael J. Wysocki wrote:
> On Wednesday, December 17, 2014 02:15:31 AM Laurent Pinchart wrote:
> > On Tuesday 16 December 2014 11:18:33 Tomasz Figa wrote:
> >> On Tue, Dec 16, 2014 at 4:53 AM, Laurent Pinchart wrote:
> >>> On Monday 15 December 2014 11:39:01 Tomasz Figa wrote:
> >>>> On Sat, Dec 13, 2014 at 5:47 AM, Laurent Pinchart wrote:
> >>>>> On Friday 12 December 2014 13:15:51 Tomasz Figa wrote:
> >>>>>> On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki wrote:
> >>>>>>> On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
> >>>>>>>> On 11 December 2014 at 16:31, Kevin Hilman wrote:
> >>>>>>>>> Tomasz Figa <tfiga@chromium.org> writes:
> >>>>>>>>>> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson wrote:
> >>>>>>>>> [...]
> >>>>>>>>> 
> >>>>>>>>>>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct
> >>>>>>>>>>>> platform_device *pdev)
> >>>>>>>>>>>>                 return -ENXIO;
> >>>>>>>>>>>>         }
> >>>>>>>>>>>> 
> >>>>>>>>>>>> +       pm_runtime_no_callbacks(dev);
> >>>>>>>>>>>> +       pm_runtime_enable(dev);
> >>>>>>>>>>>> +
> >>>>>>>>>>>> +       /* Synchronize state of the domain with driver data.
> >>>>>>>>>>>> */
> >>>>>>>>>>>> +       pm_runtime_get_sync(dev);
> >>>>>>>>>>>> +       iommu->is_powered = true;
> >>>>>>>>>>> 
> >>>>>>>>>>> Doesn't the runtime PM status reflect the value of
> >>>>>>>>>>> "is_powered", thus why do you need to have a copy of it? Could
> >>>>>>>>>>> it perpahps be that you try to cope with the case when
> >>>>>>>>>>> CONFIG_PM is unset?
> >>>>>>>>>> 
> >>>>>>>>>> It's worth noting that this driver fully relies on status of
> >>>>>>>>>> other devices in the power domain the IOMMU is in and does not
> >>>>>>>>>> enforce the status on its own. So in general, as far as my
> >>>>>>>>>> understanding of PM runtime subsystem, the status of the IOMMU
> >>>>>>>>>> device will be always suspended, because nobody will call
> >>>>>>>>>> pm_runtime_get() on it (except the get and put pair in probe).
> >>>>>>>>>> So is_powered is here to track status of the domain, not the
> >>>>>>>>>> device. Feel free to suggest a better way, though.
> >>>>>>>>> 
> >>>>>>>>> I still don't like these notifiers.  I think they add ways to
> >>>>>>>>> bypass having proper runtime PM implemented for
> >>>>>>>>> devices/subsystems.
> >>>>>>>> 
> >>>>>>>> I do agree, but I haven't found another good solution to the
> >>>>>>>> problem.
> >>>>>>> 
> >>>>>>> For the record, I'm not liking this mostly because it "fixes" a
> >>>>>>> generic problem in a way that's hidden in the genpd code and very
> >>>>>>> indirect.
> >>>>>> 
> >>>>>> Well, that's true. This is indeed a generic problem of PM
> >>>>>> dependencies between devices (other than those represented by
> >>>>>> parent-child relation), which in fact doesn't have much to do with
> >>>>>> genpd, but rather with those devices directly. It is just that
> >>>>>> genpd is the most convenient location to solve this in current code
> >>>>>> and in a simple way. In other words, I see this solution as a
> >>>>>> reasonable way to get the problem solved quickly for now, so that
> >>>>>> we can start thinking about a more elegant solution.
> >>>>>> 
> >>>>>>>>> From a high-level, the IOMMU is just another device inside the
> >>>>>>>>> PM domain, so ideally it should be doing it's own _get() and
> >>>>>>>>> _put() calls so the PM domain code would just do the right thing
> >>>>>>>>> without the need for notifiers.
> >>>>>>>> 
> >>>>>>>> As I understand it, the IOMMU (or for other similar cases)
> >>>>>>>> shouldn't be doing any get() and put() at all because there are
> >>>>>>>> no IO API to serve request from.
> >>>>> 
> >>>>> Speaking purely from an IOMMU point of view that's not entirely
> >>>>> tree. IOMMU drivers expose map and unmap operations, so they can
> >>>>> track whether any memory is mapped. From a bus master point of view
> >>>>> the IOMMU map and unmap operations are hidden by the DMA mapping
> >>>>> API. The IOMMU can thus track the existence of mappings without any
> >>>>> IOMMU awareness in the bus master driver.
> >>>>> 
> >>>>> If no mapping exist the IOMMU shouldn't receive any translation
> >>>>> request. An IOMMU driver could thus call pm_runtime_get_sync() in
> >>>>> the map handler when creating the first mapping, and
> >>>>> pm_runtime_put() in the unmap handler when tearing the last mapping
> >>>>> down.
> >>>>> 
> >>>>> One could argue that the IOMMU would end up being powered more often
> >>>>> than strictly needed, as bus masters drivers, even when written
> >>>>> properly, could keep mappings around at times they don't perform bus
> >>>>> access. This is true, and that's an argument I've raised during the
> >>>>> last kernel summit. The general response (including Linus Torvald's)
> >>>>> was that micro-optimizing power management might not be worth it,
> >>>>> and that measurements proving that the gain is worth it are required
> >>>>> before introducing new APIs to solve the problem. I can't disagree
> >>>>> with that argument.
> >>>> 
> >>>> This would be a micro optimization if the IOMMU was located in its
> >>>> own power domain. Unfortunately in most cases it is not, so keeping
> >>>> all the devices in the domain powered on, because one of them have a
> >>>> mapping created doesn't sound like a good idea.
> >>>> 
> >>>> Moreover, most of the drivers will keep the mapping for much longer
> >>>> than one run cycle. Please take a look at V4L2's videobuf2 subsystem
> >>>> (which I guess you are more familiar with than me;)), which will keep
> >>>> MMAP buffers mapped in IOMMU address space for their whole lifetime.
> >>>> I believe similar is the case for DRM drivers.
> >>> 
> >>> Yes, but that doesn't mean it gets out of control. Buffers shouldn't
> >>> be allocated if they won't be used. Granted, they could be
> >>> preallocated (or rather premapped) slightly before being used, but in
> >>> sane use cases that shouldn't be long before the hardware needs to be
> >>> turned on.
> >> 
> >> Assuming that we don't have a third party, called "user", involved.
> > 
> > Who needs that ? :-D
> > 
> >> A simple use case is video playback pause. Usually all the software
> >> state (including output buffers that can be used as reference for
> >> decoding next frames) needs to be preserved to continue decoding after
> >> resume, but it would be nice to power off the decoder, if it is unused
> >> for some period. In addition, we would like the pause/resume operation
> >> to be fast, so unmapping/freeing buffers and then exactly the opposite
> >> on resume doesn't sound like a good idea.
> > 
> > OK, then we have one possible use case. I expect people to still want to
> > see power consumption numbers though.
> 
> Well, we have them, kind of.
> 
> In the ACPI world there's something called _DEP which gives us a list of
> devices depended on by the given one.  Those may be devices whose drivers
> provide so called "operation region" handling which means that an ACPI
> method executed for the dependent device may access a device it depends on
> indirectly.  Because of that indirection we basically need the devices
> listed by _DEP to be "on" whenever the dependent device is "on" or things
> may break in nasty ways otherwise.
> 
> Now, on (some) Intel SoCs some devices listed by _DEP cannot be "on" all the
> time, because the lowest-power states of the whole SoC cannot be used then,
> which makes hours of battery life of a difference.
> 
> This isn't exactly the same problem, but it maps to the IOMMU one quite well
> IMO.

Agreed, that's certainly a use case for a power dependency implementation.

> > You can call me annoying, but I'm not sure whether a generic PM dependency
> > implementation, while it could be a good idea in general, is the best
> > solution here, especially if the bus master and the IOMMU are in a
> > different power domain. The bus master could provide functions that don't
> > require DMA access. For instance a camera controller could feed its
> > output to the display directly, without going through memory. In that
> > case we probably don't want to power the IOMMU and its complete power
> > domain on when using the camera controller in that mode.
> 
> That's a fair point, but it really boils down to energy usage numbers again.
>
> > One alternative solution would be to extend the DMA mapping API with two
> > functions to signal that DMA is about to be started and that DMA has now
> > finished. It might be considered too ad-hoc though.
> 
> It would be better to be able to reference count the DMA engine from the
> bus master IMO and arguably you can use the runtime PM framework for that.
> Namely, give bus masters someting like
> 
> 	pm_runtime_get_my_DMA_engine(bus_master_device)
> 	pm_runtime_put_my_DMA_engine(bus_master_device)
> 
> and let them call these as they see fit.

Please note that we're not talking about DMA engines here, but about IOMMUs. 
DMA is involved through the DMA mapping API which hides the IOMMU completely 
from the bus master drivers, not the DMA engine API.

Exposing the IOMMU is something we want to avoid, but DMA mapping start/stop 
operations could certainly be implemented.
Kevin Hilman Dec. 18, 2014, 9:14 p.m. UTC | #19
Laurent Pinchart <laurent.pinchart@ideasonboard.com> writes:

> Hi Rafael,
>
> On Thursday 18 December 2014 02:32:30 Rafael J. Wysocki wrote:
>> On Wednesday, December 17, 2014 02:15:31 AM Laurent Pinchart wrote:
>> > On Tuesday 16 December 2014 11:18:33 Tomasz Figa wrote:
>> >> On Tue, Dec 16, 2014 at 4:53 AM, Laurent Pinchart wrote:
>> >>> On Monday 15 December 2014 11:39:01 Tomasz Figa wrote:
>> >>>> On Sat, Dec 13, 2014 at 5:47 AM, Laurent Pinchart wrote:
>> >>>>> On Friday 12 December 2014 13:15:51 Tomasz Figa wrote:
>> >>>>>> On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki wrote:
>> >>>>>>> On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
>> >>>>>>>> On 11 December 2014 at 16:31, Kevin Hilman wrote:
>> >>>>>>>>> Tomasz Figa <tfiga@chromium.org> writes:
>> >>>>>>>>>> On Thu, Dec 11, 2014 at 8:58 PM, Ulf Hansson wrote:
>> >>>>>>>>> [...]
>> >>>>>>>>> 
>> >>>>>>>>>>>> @@ -988,11 +1107,28 @@ static int rk_iommu_probe(struct
>> >>>>>>>>>>>> platform_device *pdev)
>> >>>>>>>>>>>>                 return -ENXIO;
>> >>>>>>>>>>>>         }
>> >>>>>>>>>>>> 
>> >>>>>>>>>>>> +       pm_runtime_no_callbacks(dev);
>> >>>>>>>>>>>> +       pm_runtime_enable(dev);
>> >>>>>>>>>>>> +
>> >>>>>>>>>>>> +       /* Synchronize state of the domain with driver data.
>> >>>>>>>>>>>> */
>> >>>>>>>>>>>> +       pm_runtime_get_sync(dev);
>> >>>>>>>>>>>> +       iommu->is_powered = true;
>> >>>>>>>>>>> 
>> >>>>>>>>>>> Doesn't the runtime PM status reflect the value of
>> >>>>>>>>>>> "is_powered", thus why do you need to have a copy of it? Could
>> >>>>>>>>>>> it perpahps be that you try to cope with the case when
>> >>>>>>>>>>> CONFIG_PM is unset?
>> >>>>>>>>>> 
>> >>>>>>>>>> It's worth noting that this driver fully relies on status of
>> >>>>>>>>>> other devices in the power domain the IOMMU is in and does not
>> >>>>>>>>>> enforce the status on its own. So in general, as far as my
>> >>>>>>>>>> understanding of PM runtime subsystem, the status of the IOMMU
>> >>>>>>>>>> device will be always suspended, because nobody will call
>> >>>>>>>>>> pm_runtime_get() on it (except the get and put pair in probe).
>> >>>>>>>>>> So is_powered is here to track status of the domain, not the
>> >>>>>>>>>> device. Feel free to suggest a better way, though.
>> >>>>>>>>> 
>> >>>>>>>>> I still don't like these notifiers.  I think they add ways to
>> >>>>>>>>> bypass having proper runtime PM implemented for
>> >>>>>>>>> devices/subsystems.
>> >>>>>>>> 
>> >>>>>>>> I do agree, but I haven't found another good solution to the
>> >>>>>>>> problem.
>> >>>>>>> 
>> >>>>>>> For the record, I'm not liking this mostly because it "fixes" a
>> >>>>>>> generic problem in a way that's hidden in the genpd code and very
>> >>>>>>> indirect.
>> >>>>>> 
>> >>>>>> Well, that's true. This is indeed a generic problem of PM
>> >>>>>> dependencies between devices (other than those represented by
>> >>>>>> parent-child relation), which in fact doesn't have much to do with
>> >>>>>> genpd, but rather with those devices directly. It is just that
>> >>>>>> genpd is the most convenient location to solve this in current code
>> >>>>>> and in a simple way. In other words, I see this solution as a
>> >>>>>> reasonable way to get the problem solved quickly for now, so that
>> >>>>>> we can start thinking about a more elegant solution.
>> >>>>>> 
>> >>>>>>>>> From a high-level, the IOMMU is just another device inside the
>> >>>>>>>>> PM domain, so ideally it should be doing it's own _get() and
>> >>>>>>>>> _put() calls so the PM domain code would just do the right thing
>> >>>>>>>>> without the need for notifiers.
>> >>>>>>>> 
>> >>>>>>>> As I understand it, the IOMMU (or for other similar cases)
>> >>>>>>>> shouldn't be doing any get() and put() at all because there are
>> >>>>>>>> no IO API to serve request from.
>> >>>>> 
>> >>>>> Speaking purely from an IOMMU point of view that's not entirely
>> >>>>> tree. IOMMU drivers expose map and unmap operations, so they can
>> >>>>> track whether any memory is mapped. From a bus master point of view
>> >>>>> the IOMMU map and unmap operations are hidden by the DMA mapping
>> >>>>> API. The IOMMU can thus track the existence of mappings without any
>> >>>>> IOMMU awareness in the bus master driver.
>> >>>>> 
>> >>>>> If no mapping exist the IOMMU shouldn't receive any translation
>> >>>>> request. An IOMMU driver could thus call pm_runtime_get_sync() in
>> >>>>> the map handler when creating the first mapping, and
>> >>>>> pm_runtime_put() in the unmap handler when tearing the last mapping
>> >>>>> down.
>> >>>>> 
>> >>>>> One could argue that the IOMMU would end up being powered more often
>> >>>>> than strictly needed, as bus masters drivers, even when written
>> >>>>> properly, could keep mappings around at times they don't perform bus
>> >>>>> access. This is true, and that's an argument I've raised during the
>> >>>>> last kernel summit. The general response (including Linus Torvald's)
>> >>>>> was that micro-optimizing power management might not be worth it,
>> >>>>> and that measurements proving that the gain is worth it are required
>> >>>>> before introducing new APIs to solve the problem. I can't disagree
>> >>>>> with that argument.
>> >>>> 
>> >>>> This would be a micro optimization if the IOMMU was located in its
>> >>>> own power domain. Unfortunately in most cases it is not, so keeping
>> >>>> all the devices in the domain powered on, because one of them have a
>> >>>> mapping created doesn't sound like a good idea.
>> >>>> 
>> >>>> Moreover, most of the drivers will keep the mapping for much longer
>> >>>> than one run cycle. Please take a look at V4L2's videobuf2 subsystem
>> >>>> (which I guess you are more familiar with than me;)), which will keep
>> >>>> MMAP buffers mapped in IOMMU address space for their whole lifetime.
>> >>>> I believe similar is the case for DRM drivers.
>> >>> 
>> >>> Yes, but that doesn't mean it gets out of control. Buffers shouldn't
>> >>> be allocated if they won't be used. Granted, they could be
>> >>> preallocated (or rather premapped) slightly before being used, but in
>> >>> sane use cases that shouldn't be long before the hardware needs to be
>> >>> turned on.
>> >> 
>> >> Assuming that we don't have a third party, called "user", involved.
>> > 
>> > Who needs that ? :-D
>> > 
>> >> A simple use case is video playback pause. Usually all the software
>> >> state (including output buffers that can be used as reference for
>> >> decoding next frames) needs to be preserved to continue decoding after
>> >> resume, but it would be nice to power off the decoder, if it is unused
>> >> for some period. In addition, we would like the pause/resume operation
>> >> to be fast, so unmapping/freeing buffers and then exactly the opposite
>> >> on resume doesn't sound like a good idea.
>> > 
>> > OK, then we have one possible use case. I expect people to still want to
>> > see power consumption numbers though.
>> 
>> Well, we have them, kind of.
>> 
>> In the ACPI world there's something called _DEP which gives us a list of
>> devices depended on by the given one.  Those may be devices whose drivers
>> provide so called "operation region" handling which means that an ACPI
>> method executed for the dependent device may access a device it depends on
>> indirectly.  Because of that indirection we basically need the devices
>> listed by _DEP to be "on" whenever the dependent device is "on" or things
>> may break in nasty ways otherwise.
>> 
>> Now, on (some) Intel SoCs some devices listed by _DEP cannot be "on" all the
>> time, because the lowest-power states of the whole SoC cannot be used then,
>> which makes hours of battery life of a difference.
>> 
>> This isn't exactly the same problem, but it maps to the IOMMU one quite well
>> IMO.
>
> Agreed, that's certainly a use case for a power dependency implementation.
>
>> > You can call me annoying, but I'm not sure whether a generic PM dependency
>> > implementation, while it could be a good idea in general, is the best
>> > solution here, especially if the bus master and the IOMMU are in a
>> > different power domain. The bus master could provide functions that don't
>> > require DMA access. For instance a camera controller could feed its
>> > output to the display directly, without going through memory. In that
>> > case we probably don't want to power the IOMMU and its complete power
>> > domain on when using the camera controller in that mode.
>> 
>> That's a fair point, but it really boils down to energy usage numbers again.
>>
>> > One alternative solution would be to extend the DMA mapping API with two
>> > functions to signal that DMA is about to be started and that DMA has now
>> > finished. It might be considered too ad-hoc though.
>> 
>> It would be better to be able to reference count the DMA engine from the
>> bus master IMO and arguably you can use the runtime PM framework for that.
>> Namely, give bus masters someting like
>> 
>> 	pm_runtime_get_my_DMA_engine(bus_master_device)
>> 	pm_runtime_put_my_DMA_engine(bus_master_device)
>> 
>> and let them call these as they see fit.
>
> Please note that we're not talking about DMA engines here, but about IOMMUs. 
> DMA is involved through the DMA mapping API which hides the IOMMU completely 
> from the bus master drivers, not the DMA engine API.
>
> Exposing the IOMMU is something we want to avoid, but DMA mapping start/stop 
> operations could certainly be implemented.

The problem with that is it only solves the IOMMU problem.  We have a
more generic PM dependency problem of which this IOMMU example is only a
subset, so I think we need a more generic solution.

Kevin
Laurent Pinchart Dec. 18, 2014, 9:28 p.m. UTC | #20
Hi Kevin,

On Thursday 18 December 2014 13:14:24 Kevin Hilman wrote:
> Laurent Pinchart <laurent.pinchart@ideasonboard.com> writes:
> > On Thursday 18 December 2014 02:32:30 Rafael J. Wysocki wrote:
> >> On Wednesday, December 17, 2014 02:15:31 AM Laurent Pinchart wrote:
> >>> On Tuesday 16 December 2014 11:18:33 Tomasz Figa wrote:
> >>>> On Tue, Dec 16, 2014 at 4:53 AM, Laurent Pinchart wrote:
> >>>>> On Monday 15 December 2014 11:39:01 Tomasz Figa wrote:
> >>>>>> On Sat, Dec 13, 2014 at 5:47 AM, Laurent Pinchart wrote:
> >>>>>>> On Friday 12 December 2014 13:15:51 Tomasz Figa wrote:
> >>>>>>>> On Fri, Dec 12, 2014 at 5:48 AM, Rafael J. Wysocki wrote:
> >>>>>>>>> On Thursday, December 11, 2014 04:51:37 PM Ulf Hansson wrote:
> >>>>>>>>>> On 11 December 2014 at 16:31, Kevin Hilman wrote:

[...]

> >>>>>>>>>>> From a high-level, the IOMMU is just another device inside the
> >>>>>>>>>>> PM domain, so ideally it should be doing it's own _get() and
> >>>>>>>>>>> _put() calls so the PM domain code would just do the right
> >>>>>>>>>>> thing without the need for notifiers.
> >>>>>>>>>> 
> >>>>>>>>>> As I understand it, the IOMMU (or for other similar cases)
> >>>>>>>>>> shouldn't be doing any get() and put() at all because there are
> >>>>>>>>>> no IO API to serve request from.
> >>>>>>> 
> >>>>>>> Speaking purely from an IOMMU point of view that's not entirely
> >>>>>>> tree. IOMMU drivers expose map and unmap operations, so they can
> >>>>>>> track whether any memory is mapped. From a bus master point of view
> >>>>>>> the IOMMU map and unmap operations are hidden by the DMA mapping
> >>>>>>> API. The IOMMU can thus track the existence of mappings without any
> >>>>>>> IOMMU awareness in the bus master driver.
> >>>>>>> 
> >>>>>>> If no mapping exist the IOMMU shouldn't receive any translation
> >>>>>>> request. An IOMMU driver could thus call pm_runtime_get_sync() in
> >>>>>>> the map handler when creating the first mapping, and
> >>>>>>> pm_runtime_put() in the unmap handler when tearing the last mapping
> >>>>>>> down.
> >>>>>>> 
> >>>>>>> One could argue that the IOMMU would end up being powered more
> >>>>>>> often than strictly needed, as bus masters drivers, even when
> >>>>>>> written properly, could keep mappings around at times they don't
> >>>>>>> perform bus access. This is true, and that's an argument I've raised
> >>>>>>> during the last kernel summit. The general response (including Linus
> >>>>>>> Torvald's) was that micro-optimizing power management might not be
> >>>>>>> worth it, and that measurements proving that the gain is worth it
> >>>>>>> are required before introducing new APIs to solve the problem. I
> >>>>>>> can't disagree with that argument.
> >>>>>> 
> >>>>>> This would be a micro optimization if the IOMMU was located in its
> >>>>>> own power domain. Unfortunately in most cases it is not, so keeping
> >>>>>> all the devices in the domain powered on, because one of them have a
> >>>>>> mapping created doesn't sound like a good idea.
> >>>>>> 
> >>>>>> Moreover, most of the drivers will keep the mapping for much longer
> >>>>>> than one run cycle. Please take a look at V4L2's videobuf2 subsystem
> >>>>>> (which I guess you are more familiar with than me;)), which will
> >>>>>> keep MMAP buffers mapped in IOMMU address space for their whole
> >>>>>> lifetime. I believe similar is the case for DRM drivers.
> >>>>> 
> >>>>> Yes, but that doesn't mean it gets out of control. Buffers shouldn't
> >>>>> be allocated if they won't be used. Granted, they could be
> >>>>> preallocated (or rather premapped) slightly before being used, but in
> >>>>> sane use cases that shouldn't be long before the hardware needs to be
> >>>>> turned on.
> >>>> 
> >>>> Assuming that we don't have a third party, called "user", involved.
> >>> 
> >>> Who needs that ? :-D
> >>> 
> >>>> A simple use case is video playback pause. Usually all the software
> >>>> state (including output buffers that can be used as reference for
> >>>> decoding next frames) needs to be preserved to continue decoding after
> >>>> resume, but it would be nice to power off the decoder, if it is unused
> >>>> for some period. In addition, we would like the pause/resume operation
> >>>> to be fast, so unmapping/freeing buffers and then exactly the opposite
> >>>> on resume doesn't sound like a good idea.
> >>> 
> >>> OK, then we have one possible use case. I expect people to still want
> >>> to see power consumption numbers though.
> >> 
> >> Well, we have them, kind of.
> >> 
> >> In the ACPI world there's something called _DEP which gives us a list of
> >> devices depended on by the given one.  Those may be devices whose drivers
> >> provide so called "operation region" handling which means that an ACPI
> >> method executed for the dependent device may access a device it depends
> >> on indirectly.  Because of that indirection we basically need the devices
> >> listed by _DEP to be "on" whenever the dependent device is "on" or things
> >> may break in nasty ways otherwise.
> >> 
> >> Now, on (some) Intel SoCs some devices listed by _DEP cannot be "on" all
> >> the time, because the lowest-power states of the whole SoC cannot be
> >> used then, which makes hours of battery life of a difference.
> >> 
> >> This isn't exactly the same problem, but it maps to the IOMMU one quite
> >> well IMO.
> > 
> > Agreed, that's certainly a use case for a power dependency implementation.
> > 
> >>> You can call me annoying, but I'm not sure whether a generic PM
> >>> dependency implementation, while it could be a good idea in general, is
> >>> the best solution here, especially if the bus master and the IOMMU are
> >>> in a different power domain. The bus master could provide functions
> >>> that don't require DMA access. For instance a camera controller could
> >>> feed its output to the display directly, without going through memory.
> >>> In that case we probably don't want to power the IOMMU and its complete
> >>> power domain on when using the camera controller in that mode.
> >> 
> >> That's a fair point, but it really boils down to energy usage numbers
> >> again.
> >>
> >>> One alternative solution would be to extend the DMA mapping API with
> >>> two functions to signal that DMA is about to be started and that DMA has
> >>> now finished. It might be considered too ad-hoc though.
> >> 
> >> It would be better to be able to reference count the DMA engine from the
> >> bus master IMO and arguably you can use the runtime PM framework for
> >> that. Namely, give bus masters someting like
> >> 
> >> 	pm_runtime_get_my_DMA_engine(bus_master_device)
> >> 	pm_runtime_put_my_DMA_engine(bus_master_device)
> >> 
> >> and let them call these as they see fit.
> > 
> > Please note that we're not talking about DMA engines here, but about
> > IOMMUs. DMA is involved through the DMA mapping API which hides the IOMMU
> > completely from the bus master drivers, not the DMA engine API.
> > 
> > Exposing the IOMMU is something we want to avoid, but DMA mapping
> > start/stop operations could certainly be implemented.
> 
> The problem with that is it only solves the IOMMU problem.  We have a
> more generic PM dependency problem of which this IOMMU example is only a
> subset, so I think we need a more generic solution.

I agree that a more generic solution is needed at least to support ACPI _DEP, 
but that might not be optimal in the IOMMU use case as explained above.
Rafael J. Wysocki Dec. 19, 2014, 2:27 a.m. UTC | #21
On Thursday, December 18, 2014 11:28:58 PM Laurent Pinchart wrote:
> Hi Kevin,
> 

[cut]

> > >> 
> > >> It would be better to be able to reference count the DMA engine from the
> > >> bus master IMO and arguably you can use the runtime PM framework for
> > >> that. Namely, give bus masters someting like
> > >> 
> > >> 	pm_runtime_get_my_DMA_engine(bus_master_device)
> > >> 	pm_runtime_put_my_DMA_engine(bus_master_device)
> > >> 
> > >> and let them call these as they see fit.
> > > 
> > > Please note that we're not talking about DMA engines here, but about
> > > IOMMUs. DMA is involved through the DMA mapping API which hides the IOMMU
> > > completely from the bus master drivers, not the DMA engine API.
> > > 
> > > Exposing the IOMMU is something we want to avoid, but DMA mapping
> > > start/stop operations could certainly be implemented.
> > 
> > The problem with that is it only solves the IOMMU problem.  We have a
> > more generic PM dependency problem of which this IOMMU example is only a
> > subset, so I think we need a more generic solution.
> 
> I agree that a more generic solution is needed at least to support ACPI _DEP, 
> but that might not be optimal in the IOMMU use case as explained above.

Well, since we need it anyway, why don't we implement it and then figure out
if anything more specific needs to be done for the IOMMU case?
Laurent Pinchart Dec. 20, 2014, 7:01 p.m. UTC | #22
Hi Kevin,

On Friday 19 December 2014 03:27:35 Rafael J. Wysocki wrote:
> On Thursday, December 18, 2014 11:28:58 PM Laurent Pinchart wrote:
> > Hi Kevin,
> 
> [cut]
> 
>>>>> It would be better to be able to reference count the DMA engine from
>>>>> the bus master IMO and arguably you can use the runtime PM framework
>>>>> for that. Namely, give bus masters someting like
>>>>> 
>>>>> 	pm_runtime_get_my_DMA_engine(bus_master_device)
>>>>> 	pm_runtime_put_my_DMA_engine(bus_master_device)
>>>>> 
>>>>> and let them call these as they see fit.
>>>> 
>>>> Please note that we're not talking about DMA engines here, but about
>>>> IOMMUs. DMA is involved through the DMA mapping API which hides the
>>>> IOMMU completely from the bus master drivers, not the DMA engine API.
>>>> 
>>>> Exposing the IOMMU is something we want to avoid, but DMA mapping
>>>> start/stop operations could certainly be implemented.
>>> 
>>> The problem with that is it only solves the IOMMU problem.  We have a
>>> more generic PM dependency problem of which this IOMMU example is only a
>>> subset, so I think we need a more generic solution.
>> 
>> I agree that a more generic solution is needed at least to support ACPI
>> _DEP, but that might not be optimal in the IOMMU use case as explained
>> above.
>
> Well, since we need it anyway, why don't we implement it and then figure out
> if anything more specific needs to be done for the IOMMU case?

Patches are welcome ;-)
diff mbox

Patch

diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index b2023af..9120655 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -20,6 +20,8 @@ 
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 
@@ -88,6 +90,9 @@  struct rk_iommu {
 	int irq;
 	struct list_head node; /* entry in rk_iommu_domain.iommus */
 	struct iommu_domain *domain; /* domain to which iommu is attached */
+	struct notifier_block genpd_nb;
+	spinlock_t hw_lock; /* lock for register access */
+	bool is_powered; /* power domain is on and register clock is enabled */
 };
 
 static inline void rk_table_flush(u32 *va, unsigned int count)
@@ -283,6 +288,9 @@  static void rk_iommu_zap_lines(struct rk_iommu *iommu, dma_addr_t iova,
 			       size_t size)
 {
 	dma_addr_t iova_end = iova + size;
+
+	assert_spin_locked(&iommu->hw_lock);
+
 	/*
 	 * TODO(djkurtz): Figure out when it is more efficient to shootdown the
 	 * entire iotlb rather than iterate over individual iovas.
@@ -293,11 +301,15 @@  static void rk_iommu_zap_lines(struct rk_iommu *iommu, dma_addr_t iova,
 
 static bool rk_iommu_is_stall_active(struct rk_iommu *iommu)
 {
+	assert_spin_locked(&iommu->hw_lock);
+
 	return rk_iommu_read(iommu, RK_MMU_STATUS) & RK_MMU_STATUS_STALL_ACTIVE;
 }
 
 static bool rk_iommu_is_paging_enabled(struct rk_iommu *iommu)
 {
+	assert_spin_locked(&iommu->hw_lock);
+
 	return rk_iommu_read(iommu, RK_MMU_STATUS) &
 			     RK_MMU_STATUS_PAGING_ENABLED;
 }
@@ -306,6 +318,8 @@  static int rk_iommu_enable_stall(struct rk_iommu *iommu)
 {
 	int ret;
 
+	assert_spin_locked(&iommu->hw_lock);
+
 	if (rk_iommu_is_stall_active(iommu))
 		return 0;
 
@@ -327,6 +341,8 @@  static int rk_iommu_disable_stall(struct rk_iommu *iommu)
 {
 	int ret;
 
+	assert_spin_locked(&iommu->hw_lock);
+
 	if (!rk_iommu_is_stall_active(iommu))
 		return 0;
 
@@ -344,6 +360,8 @@  static int rk_iommu_enable_paging(struct rk_iommu *iommu)
 {
 	int ret;
 
+	assert_spin_locked(&iommu->hw_lock);
+
 	if (rk_iommu_is_paging_enabled(iommu))
 		return 0;
 
@@ -361,6 +379,8 @@  static int rk_iommu_disable_paging(struct rk_iommu *iommu)
 {
 	int ret;
 
+	assert_spin_locked(&iommu->hw_lock);
+
 	if (!rk_iommu_is_paging_enabled(iommu))
 		return 0;
 
@@ -379,6 +399,8 @@  static int rk_iommu_force_reset(struct rk_iommu *iommu)
 	int ret;
 	u32 dte_addr;
 
+	assert_spin_locked(&iommu->hw_lock);
+
 	/*
 	 * Check if register DTE_ADDR is working by writing DTE_ADDR_DUMMY
 	 * and verifying that upper 5 nybbles are read back.
@@ -401,6 +423,50 @@  static int rk_iommu_force_reset(struct rk_iommu *iommu)
 	return ret;
 }
 
+static int rk_iommu_startup(struct rk_iommu *iommu)
+{
+	struct iommu_domain *domain = iommu->domain;
+	struct rk_iommu_domain *rk_domain;
+	phys_addr_t dte_addr;
+	int ret;
+
+	assert_spin_locked(&iommu->hw_lock);
+
+	ret = rk_iommu_enable_stall(iommu);
+	if (ret)
+		return ret;
+
+	ret = rk_iommu_force_reset(iommu);
+	if (ret)
+		return ret;
+
+	rk_domain = domain->priv;
+	dte_addr = virt_to_phys(rk_domain->dt);
+	rk_iommu_write(iommu, RK_MMU_DTE_ADDR, dte_addr);
+	rk_iommu_command(iommu, RK_MMU_CMD_ZAP_CACHE);
+	rk_iommu_write(iommu, RK_MMU_INT_MASK, RK_MMU_IRQ_MASK);
+
+	ret = rk_iommu_enable_paging(iommu);
+	if (ret)
+		return ret;
+
+	rk_iommu_disable_stall(iommu);
+
+	return ret;
+}
+
+static void rk_iommu_shutdown(struct rk_iommu *iommu)
+{
+	assert_spin_locked(&iommu->hw_lock);
+
+	/* Ignore error while disabling, just keep going */
+	rk_iommu_enable_stall(iommu);
+	rk_iommu_disable_paging(iommu);
+	rk_iommu_write(iommu, RK_MMU_INT_MASK, 0);
+	rk_iommu_write(iommu, RK_MMU_DTE_ADDR, 0);
+	rk_iommu_disable_stall(iommu);
+}
+
 static void log_iova(struct rk_iommu *iommu, dma_addr_t iova)
 {
 	u32 dte_index, pte_index, page_offset;
@@ -414,6 +480,8 @@  static void log_iova(struct rk_iommu *iommu, dma_addr_t iova)
 	phys_addr_t page_addr_phys = 0;
 	u32 page_flags = 0;
 
+	assert_spin_locked(&iommu->hw_lock);
+
 	dte_index = rk_iova_dte_index(iova);
 	pte_index = rk_iova_pte_index(iova);
 	page_offset = rk_iova_page_offset(iova);
@@ -450,13 +518,22 @@  print_it:
 static irqreturn_t rk_iommu_irq(int irq, void *dev_id)
 {
 	struct rk_iommu *iommu = dev_id;
+	irqreturn_t ret = IRQ_HANDLED;
+	unsigned long flags;
 	u32 status;
 	u32 int_status;
 	dma_addr_t iova;
 
+	spin_lock_irqsave(&iommu->hw_lock, flags);
+
+	if (!iommu->is_powered)
+		goto done;
+
 	int_status = rk_iommu_read(iommu, RK_MMU_INT_STATUS);
-	if (int_status == 0)
-		return IRQ_NONE;
+	if (int_status == 0) {
+		ret = IRQ_NONE;
+		goto done;
+	}
 
 	iova = rk_iommu_read(iommu, RK_MMU_PAGE_FAULT_ADDR);
 
@@ -497,7 +574,10 @@  static irqreturn_t rk_iommu_irq(int irq, void *dev_id)
 
 	rk_iommu_write(iommu, RK_MMU_INT_CLEAR, int_status);
 
-	return IRQ_HANDLED;
+done:
+	spin_unlock_irqrestore(&iommu->hw_lock, flags);
+
+	return ret;
 }
 
 static phys_addr_t rk_iommu_iova_to_phys(struct iommu_domain *domain,
@@ -537,9 +617,15 @@  static void rk_iommu_zap_iova(struct rk_iommu_domain *rk_domain,
 	/* shootdown these iova from all iommus using this domain */
 	spin_lock_irqsave(&rk_domain->iommus_lock, flags);
 	list_for_each(pos, &rk_domain->iommus) {
-		struct rk_iommu *iommu;
-		iommu = list_entry(pos, struct rk_iommu, node);
-		rk_iommu_zap_lines(iommu, iova, size);
+		struct rk_iommu *iommu = list_entry(pos, struct rk_iommu, node);
+
+		spin_lock(&iommu->hw_lock);
+
+		/* Only zap TLBs of IOMMUs that are powered on. */
+		if (iommu->is_powered)
+			rk_iommu_zap_lines(iommu, iova, size);
+
+		spin_unlock(&iommu->hw_lock);
 	}
 	spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
 }
@@ -729,7 +815,6 @@  static int rk_iommu_attach_device(struct iommu_domain *domain,
 	struct rk_iommu_domain *rk_domain = domain->priv;
 	unsigned long flags;
 	int ret;
-	phys_addr_t dte_addr;
 
 	/*
 	 * Allow 'virtual devices' (e.g., drm) to attach to domain.
@@ -739,37 +824,22 @@  static int rk_iommu_attach_device(struct iommu_domain *domain,
 	if (!iommu)
 		return 0;
 
-	ret = rk_iommu_enable_stall(iommu);
-	if (ret)
-		return ret;
-
-	ret = rk_iommu_force_reset(iommu);
-	if (ret)
-		return ret;
-
-	iommu->domain = domain;
-
 	ret = devm_request_irq(dev, iommu->irq, rk_iommu_irq,
 			       IRQF_SHARED, dev_name(dev), iommu);
 	if (ret)
 		return ret;
 
-	dte_addr = virt_to_phys(rk_domain->dt);
-	rk_iommu_write(iommu, RK_MMU_DTE_ADDR, dte_addr);
-	rk_iommu_command(iommu, RK_MMU_CMD_ZAP_CACHE);
-	rk_iommu_write(iommu, RK_MMU_INT_MASK, RK_MMU_IRQ_MASK);
-
-	ret = rk_iommu_enable_paging(iommu);
-	if (ret)
-		return ret;
-
 	spin_lock_irqsave(&rk_domain->iommus_lock, flags);
 	list_add_tail(&iommu->node, &rk_domain->iommus);
 	spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
 
-	dev_info(dev, "Attached to iommu domain\n");
+	spin_lock_irqsave(&iommu->hw_lock, flags);
+	iommu->domain = domain;
+	if (iommu->is_powered)
+		rk_iommu_startup(iommu);
+	spin_unlock_irqrestore(&iommu->hw_lock, flags);
 
-	rk_iommu_disable_stall(iommu);
+	dev_info(dev, "Attached to iommu domain\n");
 
 	return 0;
 }
@@ -790,17 +860,14 @@  static void rk_iommu_detach_device(struct iommu_domain *domain,
 	list_del_init(&iommu->node);
 	spin_unlock_irqrestore(&rk_domain->iommus_lock, flags);
 
-	/* Ignore error while disabling, just keep going */
-	rk_iommu_enable_stall(iommu);
-	rk_iommu_disable_paging(iommu);
-	rk_iommu_write(iommu, RK_MMU_INT_MASK, 0);
-	rk_iommu_write(iommu, RK_MMU_DTE_ADDR, 0);
-	rk_iommu_disable_stall(iommu);
+	spin_lock_irqsave(&iommu->hw_lock, flags);
+	if (iommu->is_powered)
+		rk_iommu_shutdown(iommu);
+	iommu->domain = NULL;
+	spin_unlock_irqrestore(&iommu->hw_lock, flags);
 
 	devm_free_irq(dev, iommu->irq, iommu);
 
-	iommu->domain = NULL;
-
 	dev_info(dev, "Detached from iommu domain\n");
 }
 
@@ -964,6 +1031,57 @@  static const struct iommu_ops rk_iommu_ops = {
 	.pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP,
 };
 
+static int rk_iommu_power_on(struct rk_iommu *iommu)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&iommu->hw_lock, flags);
+
+	iommu->is_powered = true;
+	if (iommu->domain)
+		ret = rk_iommu_startup(iommu);
+
+	spin_unlock_irqrestore(&iommu->hw_lock, flags);
+
+	return ret;
+}
+
+static void rk_iommu_power_off(struct rk_iommu *iommu)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->hw_lock, flags);
+
+	if (iommu->domain)
+		rk_iommu_shutdown(iommu);
+	iommu->is_powered = false;
+
+	spin_unlock_irqrestore(&iommu->hw_lock, flags);
+}
+
+static int rk_iommu_genpd_notify(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct rk_iommu *iommu = container_of(nb, struct rk_iommu, genpd_nb);
+	int ret = 0;
+
+	switch (action) {
+	case PM_GENPD_POST_POWER_ON:
+		ret = rk_iommu_power_on(iommu);
+		break;
+
+	case PM_GENPD_POWER_OFF_PREPARE:
+		rk_iommu_power_off(iommu);
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return notifier_from_errno(ret);
+}
+
 static int rk_iommu_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -976,6 +1094,7 @@  static int rk_iommu_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, iommu);
 	iommu->dev = dev;
+	spin_lock_init(&iommu->hw_lock);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	iommu->base = devm_ioremap_resource(&pdev->dev, res);
@@ -988,11 +1107,28 @@  static int rk_iommu_probe(struct platform_device *pdev)
 		return -ENXIO;
 	}
 
+	pm_runtime_no_callbacks(dev);
+	pm_runtime_enable(dev);
+
+	/* Synchronize state of the domain with driver data. */
+	pm_runtime_get_sync(dev);
+	iommu->is_powered = true;
+
+	iommu->genpd_nb.notifier_call = rk_iommu_genpd_notify;
+	pm_genpd_register_notifier(dev, &iommu->genpd_nb);
+
+	pm_runtime_put(dev);
+
 	return 0;
 }
 
 static int rk_iommu_remove(struct platform_device *pdev)
 {
+	struct rk_iommu *iommu = platform_get_drvdata(pdev);
+
+	pm_genpd_unregister_notifier(iommu->dev, &iommu->genpd_nb);
+	pm_runtime_disable(&pdev->dev);
+
 	return 0;
 }