diff mbox series

[v5,5/5] PCI: qcom: Add OPP support to scale performance state of power domain

Message ID 1694066433-8677-6-git-send-email-quic_krichai@quicinc.com (mailing list archive)
State New
Delegated to: viresh kumar
Headers show
Series PCI: qcom: Add support for OPP | expand

Commit Message

Krishna chaitanya chundru Sept. 7, 2023, 6 a.m. UTC
While scaling the interconnect clocks based on PCIe link speed, it is also
mandatory to scale the power domain performance state so that the SoC can
run under optimum power conditions.

Signed-off-by: Krishna chaitanya chundru <quic_krichai@quicinc.com>
---
 drivers/pci/controller/dwc/pcie-qcom.c | 58 ++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 9 deletions(-)

Comments

Viresh Kumar Sept. 28, 2023, 6:31 a.m. UTC | #1
Adding everyone back, reply to you only by mistake earlier :(

On 28-09-23, 08:53, Krishna Chaitanya Chundru wrote:
> 
> On 9/27/2023 12:23 PM, Viresh Kumar wrote:
> > On 07-09-23, 11:30, Krishna chaitanya chundru wrote:
> > > While scaling the interconnect clocks based on PCIe link speed, it is also
> > > mandatory to scale the power domain performance state so that the SoC can
> > > run under optimum power conditions.
> > Why aren't you scaling interconnect bw via OPP core too ?
> 
> The power domain performance state varies from PCIe instance to instance and
> from target to target,
> 
> whereas interconnect BW remains same and changes only based upon PCIe GEN
> speed. So in the driver code itself
> 
> based upon GEN speed we are calculating the BW and voting for it.
> 
> That is the reason we are not scaling interconnect BW through OPP as no dt
> entries required for this.

Not sure I understand it fully yet. I tried looking at code and this is what I
see:

At probe initialization, you just configure bw.

Later on, towards end of probe and resume, you set both bw and performance
state.

Also your DT changes add virtual level numbers to PCIe OPP table like this:
+                               opp-1 {
+                                       opp-level = <1>;
+                                       required-opps = <&rpmhpd_opp_low_svs>;
+                               };

Instead what you can do here is, add bw values and remove level completely (as
it is not serving any meaningful purpose) and use the OPP core to set both bw
and performance state (via required OPPs).

What won't work if you do this ?
Manivannan Sadhasivam Nov. 1, 2023, 6:33 a.m. UTC | #2
On Thu, Sep 07, 2023 at 11:30:33AM +0530, Krishna chaitanya chundru wrote:
> While scaling the interconnect clocks based on PCIe link speed, it is also
> mandatory to scale the power domain performance state so that the SoC can
> run under optimum power conditions.
> 
> Signed-off-by: Krishna chaitanya chundru <quic_krichai@quicinc.com>
> ---
>  drivers/pci/controller/dwc/pcie-qcom.c | 58 ++++++++++++++++++++++++++++------
>  1 file changed, 49 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
> index ca6350b..1817e96 100644
> --- a/drivers/pci/controller/dwc/pcie-qcom.c
> +++ b/drivers/pci/controller/dwc/pcie-qcom.c
> @@ -22,6 +22,7 @@
>  #include <linux/of.h>
>  #include <linux/of_gpio.h>
>  #include <linux/pci.h>
> +#include <linux/pm_opp.h>
>  #include <linux/pm_runtime.h>
>  #include <linux/platform_device.h>
>  #include <linux/phy/pcie.h>
> @@ -240,6 +241,7 @@ struct qcom_pcie {
>  	const struct qcom_pcie_cfg *cfg;
>  	struct dentry *debugfs;
>  	bool suspended;
> +	bool opp_supported;
>  };
>  
>  #define to_qcom_pcie(x)		dev_get_drvdata((x)->dev)
> @@ -1357,14 +1359,13 @@ static int qcom_pcie_icc_init(struct qcom_pcie *pcie)
>  	return 0;
>  }
>  
> -static int qcom_pcie_icc_update(struct qcom_pcie *pcie)
> +static int qcom_pcie_icc_opp_update(struct qcom_pcie *pcie)
>  {
>  	struct dw_pcie *pci = pcie->pci;
> +	struct dev_pm_opp *opp;
>  	u32 offset, status, bw;
>  	int speed, width;
> -
> -	if (!pcie->icc_mem)
> -		return 0;
> +	int ret;
>  
>  	offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
>  	status = readw(pci->dbi_base + offset + PCI_EXP_LNKSTA);
> @@ -1391,7 +1392,21 @@ static int qcom_pcie_icc_update(struct qcom_pcie *pcie)
>  		break;
>  	}
>  
> -	return icc_set_bw(pcie->icc_mem, 0, width * bw);
> +	if (pcie->opp_supported) {
> +		opp = dev_pm_opp_find_level_exact(pci->dev, speed);
> +		if (!IS_ERR(opp)) {
> +			ret = dev_pm_opp_set_opp(pci->dev, opp);
> +			if (ret)
> +				dev_err(pci->dev, "Failed to set opp: level %d ret %d\n",
> +					dev_pm_opp_get_level(opp), ret);
> +			dev_pm_opp_put(opp);
> +		}
> +	}
> +
> +	if (pcie->icc_mem)
> +		ret = icc_set_bw(pcie->icc_mem, 0, width * bw);

I think you should tie interconnect scaling with OPP as suggested by Viresh,
since you are updating both OPP and BW at the same time.

- Mani

> +
> +	return ret;
>  }
>  
>  static int qcom_pcie_link_transition_count(struct seq_file *s, void *data)
> @@ -1434,8 +1449,10 @@ static void qcom_pcie_init_debugfs(struct qcom_pcie *pcie)
>  static int qcom_pcie_probe(struct platform_device *pdev)
>  {
>  	const struct qcom_pcie_cfg *pcie_cfg;
> +	unsigned long max_level = INT_MAX;
>  	struct device *dev = &pdev->dev;
>  	struct qcom_pcie *pcie;
> +	struct dev_pm_opp *opp;
>  	struct dw_pcie_rp *pp;
>  	struct resource *res;
>  	struct dw_pcie *pci;
> @@ -1506,6 +1523,27 @@ static int qcom_pcie_probe(struct platform_device *pdev)
>  	if (ret)
>  		goto err_pm_runtime_put;
>  
> +	/* OPP table is optional */
> +	ret = devm_pm_opp_of_add_table(dev);
> +	if (ret && ret != -ENODEV) {
> +		dev_err_probe(dev, ret, "Failed to add OPP table\n");
> +		goto err_pm_runtime_put;
> +	}
> +
> +	/* vote for max level in the opp table if opp table is present */
> +	if (ret != -ENODEV) {
> +		opp = dev_pm_opp_find_level_floor(dev, &max_level);
> +		if (!IS_ERR(opp)) {
> +			ret = dev_pm_opp_set_opp(dev, opp);
> +			if (ret)
> +				dev_err_probe(pci->dev, ret,
> +					      "Failed to set opp: level %d\n",
> +					      dev_pm_opp_get_level(opp));
> +			dev_pm_opp_put(opp);
> +		}
> +		pcie->opp_supported = true;
> +	}
> +
>  	ret = pcie->cfg->ops->get_resources(pcie);
>  	if (ret)
>  		goto err_pm_runtime_put;
> @@ -1524,9 +1562,9 @@ static int qcom_pcie_probe(struct platform_device *pdev)
>  		goto err_phy_exit;
>  	}
>  
> -	ret = qcom_pcie_icc_update(pcie);
> +	ret = qcom_pcie_icc_opp_update(pcie);
>  	if (ret)
> -		dev_err(dev, "failed to update interconnect bandwidth: %d\n",
> +		dev_err(dev, "failed to update interconnect bandwidth/opp: %d\n",
>  			ret);
>  
>  	if (pcie->mhi)
> @@ -1575,6 +1613,8 @@ static int qcom_pcie_suspend_noirq(struct device *dev)
>  	 */
>  	if (!dw_pcie_link_up(pcie->pci)) {
>  		qcom_pcie_host_deinit(&pcie->pci->pp);
> +		if (pcie->opp_supported)
> +			dev_pm_opp_set_opp(dev, NULL);
>  		pcie->suspended = true;
>  	}
>  
> @@ -1594,9 +1634,9 @@ static int qcom_pcie_resume_noirq(struct device *dev)
>  		pcie->suspended = false;
>  	}
>  
> -	ret = qcom_pcie_icc_update(pcie);
> +	ret = qcom_pcie_icc_opp_update(pcie);
>  	if (ret)
> -		dev_err(dev, "failed to update interconnect bandwidth: %d\n",
> +		dev_err(dev, "failed to update interconnect bandwidth/opp: %d\n",
>  			ret);
>  
>  	return 0;
> -- 
> 2.7.4
>
Bjorn Helgaas Nov. 1, 2023, 10:17 p.m. UTC | #3
On Thu, Sep 07, 2023 at 11:30:33AM +0530, Krishna chaitanya chundru wrote:
> While scaling the interconnect clocks based on PCIe link speed, it is also
> mandatory to scale the power domain performance state so that the SoC can
> run under optimum power conditions.

Can you expand "OPP" somewhere so we know what it stands for?  I'm
sure everybody knows except me :)

This commit log says something is mandatory; can you phrase it so it
says what the patch actually *does*?  The subject is kind of a title,
and I think it's important for the log to make sense without the
subject, so it's OK if the log repeats part or all of the subject.

Bjorn
Viresh Kumar Nov. 2, 2023, 5:30 a.m. UTC | #4
On 01-11-23, 17:17, Bjorn Helgaas wrote:
> Can you expand "OPP" somewhere so we know what it stands for?  I'm
> sure everybody knows except me :)

It is "Operating Performance Points", defined here:

Documentation/power/opp.rst
Bjorn Helgaas Nov. 2, 2023, 12:09 p.m. UTC | #5
On Thu, Nov 02, 2023 at 11:00:13AM +0530, Viresh Kumar wrote:
> On 01-11-23, 17:17, Bjorn Helgaas wrote:
> > Can you expand "OPP" somewhere so we know what it stands for?  I'm
> > sure everybody knows except me :)
> 
> It is "Operating Performance Points", defined here:
> 
> Documentation/power/opp.rst

Thanks; I meant in the subject or commit log of the next revision, of
course.
Viresh Kumar Nov. 3, 2023, 5:12 a.m. UTC | #6
On 02-11-23, 07:09, Bjorn Helgaas wrote:
> On Thu, Nov 02, 2023 at 11:00:13AM +0530, Viresh Kumar wrote:
> > On 01-11-23, 17:17, Bjorn Helgaas wrote:
> > > Can you expand "OPP" somewhere so we know what it stands for?  I'm
> > > sure everybody knows except me :)
> > 
> > It is "Operating Performance Points", defined here:
> > 
> > Documentation/power/opp.rst
> 
> Thanks; I meant in the subject or commit log of the next revision, of
> course.

Yeah, I understood that. Krishna shall do it in next version I believe.
Krishna chaitanya chundru Nov. 8, 2023, 2:32 a.m. UTC | #7
On 11/3/2023 10:42 AM, Viresh Kumar wrote:
> On 02-11-23, 07:09, Bjorn Helgaas wrote:
>> On Thu, Nov 02, 2023 at 11:00:13AM +0530, Viresh Kumar wrote:
>>> On 01-11-23, 17:17, Bjorn Helgaas wrote:
>>>> Can you expand "OPP" somewhere so we know what it stands for?  I'm
>>>> sure everybody knows except me :)
>>> It is "Operating Performance Points", defined here:
>>>
>>> Documentation/power/opp.rst
>> Thanks; I meant in the subject or commit log of the next revision, of
>> course.
> Yeah, I understood that. Krishna shall do it in next version I believe.
>
Hi All,

I will do this in my next patch both commit message and ICC voting 
through OPP

got stuck in some other work, will try to send new series as soon as 
possible.

- Krishna Chaitanya.
Krishna chaitanya chundru Jan. 8, 2024, 1:19 p.m. UTC | #8
On 11/8/2023 8:02 AM, Krishna Chaitanya Chundru wrote:
>
> On 11/3/2023 10:42 AM, Viresh Kumar wrote:
>> On 02-11-23, 07:09, Bjorn Helgaas wrote:
>>> On Thu, Nov 02, 2023 at 11:00:13AM +0530, Viresh Kumar wrote:
>>>> On 01-11-23, 17:17, Bjorn Helgaas wrote:
>>>>> Can you expand "OPP" somewhere so we know what it stands for?  I'm
>>>>> sure everybody knows except me :)
>>>> It is "Operating Performance Points", defined here:
>>>>
>>>> Documentation/power/opp.rst
>>> Thanks; I meant in the subject or commit log of the next revision, of
>>> course.
>> Yeah, I understood that. Krishna shall do it in next version I believe.
>>
> Hi All,
>
> I will do this in my next patch both commit message and ICC voting 
> through OPP
>
> got stuck in some other work, will try to send new series as soon as 
> possible.
>
> - Krishna Chaitanya.
>
Hi Viresh,

Sorry for late response.

We calculate ICC BW voting based up on PCIe speed and PCIe width.

Right now we are adding the opp table based up on PCIe speed.

Each PCIe controller can support multiple lane configurations like x1, 
x2, x4, x8, x16 based up on controller capability.

So for each GEN speed we need  up to 5 entries in OPP table. This will 
make OPP table very long.

It is best to calculate the ICC BW voting in the driver itself and apply 
them through ICC driver.

Let me know your opinion on this.

Thanks & Regards,

Krishna Chaitanya.
Viresh Kumar Jan. 10, 2024, 6:57 a.m. UTC | #9
On 08-01-24, 18:49, Krishna Chaitanya Chundru wrote:
> We calculate ICC BW voting based up on PCIe speed and PCIe width.
> 
> Right now we are adding the opp table based up on PCIe speed.
> 
> Each PCIe controller can support multiple lane configurations like x1, x2,
> x4, x8, x16 based up on controller capability.
> 
> So for each GEN speed we need  up to 5 entries in OPP table. This will make
> OPP table very long.
> 
> It is best to calculate the ICC BW voting in the driver itself and apply
> them through ICC driver.

I see. Are the lane configurations fixed for a platform ? I mean, do you change
those configurations at runtime or is that something that never changes, but the
driver can end up getting used on a hardware that supports any one of them ?

If they are fixed (second case), then you can use dev_pm_opp_set_prop_name() to
make that easier for you. With that you will only need 5 OPP entries, but each
of them will have five values of bw:

bw-x1, bw-x2, ....  and you can select one of them during initialization.
Krishna chaitanya chundru Jan. 10, 2024, 7:12 a.m. UTC | #10
On 1/10/2024 12:27 PM, Viresh Kumar wrote:
> On 08-01-24, 18:49, Krishna Chaitanya Chundru wrote:
>> We calculate ICC BW voting based up on PCIe speed and PCIe width.
>>
>> Right now we are adding the opp table based up on PCIe speed.
>>
>> Each PCIe controller can support multiple lane configurations like x1, x2,
>> x4, x8, x16 based up on controller capability.
>>
>> So for each GEN speed we need  up to 5 entries in OPP table. This will make
>> OPP table very long.
>>
>> It is best to calculate the ICC BW voting in the driver itself and apply
>> them through ICC driver.
> I see. Are the lane configurations fixed for a platform ? I mean, do you change
> those configurations at runtime or is that something that never changes, but the
> driver can end up getting used on a hardware that supports any one of them ?
>
> If they are fixed (second case), then you can use dev_pm_opp_set_prop_name() to
> make that easier for you. With that you will only need 5 OPP entries, but each
> of them will have five values of bw:
>
> bw-x1, bw-x2, ....  and you can select one of them during initialization.

Hi Viresh,

At present we are not changing the link width after link is initialized, 
but we have plans to

add support change link width dynamically at runtime.

So, I think it is better to have ICC BW voting in the driver itself.

Thanks & Regards,

Krishna Chaitanya.
Viresh Kumar Jan. 10, 2024, 7:38 a.m. UTC | #11
On 10-01-24, 12:42, Krishna Chaitanya Chundru wrote:
> At present we are not changing the link width after link is initialized, but
> we have plans to
> 
> add support change link width dynamically at runtime.

Hmm okay.

> So, I think it is better to have ICC BW voting in the driver itself.

I guess it is better to have more entries in the OPP table then.. 15-20 OPPs
isn't too many to be honest.

Replicating code is the last thing I would like to do.

Maybe you can show the different layouts of the OPP table if you are concerned.
We can then see if it is getting too much or not.
Krishna chaitanya chundru Jan. 10, 2024, 12:58 p.m. UTC | #12
On 1/10/2024 1:08 PM, Viresh Kumar wrote:
> On 10-01-24, 12:42, Krishna Chaitanya Chundru wrote:
>> At present we are not changing the link width after link is initialized, but
>> we have plans to
>>
>> add support change link width dynamically at runtime.
> Hmm okay.
>
>> So, I think it is better to have ICC BW voting in the driver itself.
> I guess it is better to have more entries in the OPP table then.. 15-20 OPPs
> isn't too many to be honest.
>
> Replicating code is the last thing I would like to do.
>
> Maybe you can show the different layouts of the OPP table if you are concerned.
> We can then see if it is getting too much or not.

Viresh,

it might be less only for now may be around 20 opp entries, but PCIe 
spec is being updated every few years and a new gen

gen speed will release, right now PCIe GEN6 is released but I don't we 
had any device in the market now and GEN7 is in process.

So in future it might become very big table. Either we need to come up 
with a framework in the OPP to select the BW based up on lane width

for particular speed or use the driver way.


Thanks & Regards,

Krishna Chaitanya.
Viresh Kumar Jan. 11, 2024, 3:32 a.m. UTC | #13
On 10-01-24, 18:28, Krishna Chaitanya Chundru wrote:
> it might be less only for now may be around 20 opp entries, but PCIe spec is
> being updated every few years and a new gen
> 
> gen speed will release, right now PCIe GEN6 is released but I don't we had
> any device in the market now and GEN7 is in process.
> 
> So in future it might become very big table. Either we need to come up with
> a framework in the OPP to select the BW based up on lane width
> 
> for particular speed or use the driver way.

Lets solve the problem the right (current) way for right now and revisit the
whole thing when it gets complex ? So I would suggest configuring the bw via the
OPP framework only, since it takes care of that for all other device types too.

We can surely revisit and try to do it differently if we find some issues going
forward.
diff mbox series

Patch

diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index ca6350b..1817e96 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -22,6 +22,7 @@ 
 #include <linux/of.h>
 #include <linux/of_gpio.h>
 #include <linux/pci.h>
+#include <linux/pm_opp.h>
 #include <linux/pm_runtime.h>
 #include <linux/platform_device.h>
 #include <linux/phy/pcie.h>
@@ -240,6 +241,7 @@  struct qcom_pcie {
 	const struct qcom_pcie_cfg *cfg;
 	struct dentry *debugfs;
 	bool suspended;
+	bool opp_supported;
 };
 
 #define to_qcom_pcie(x)		dev_get_drvdata((x)->dev)
@@ -1357,14 +1359,13 @@  static int qcom_pcie_icc_init(struct qcom_pcie *pcie)
 	return 0;
 }
 
-static int qcom_pcie_icc_update(struct qcom_pcie *pcie)
+static int qcom_pcie_icc_opp_update(struct qcom_pcie *pcie)
 {
 	struct dw_pcie *pci = pcie->pci;
+	struct dev_pm_opp *opp;
 	u32 offset, status, bw;
 	int speed, width;
-
-	if (!pcie->icc_mem)
-		return 0;
+	int ret;
 
 	offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
 	status = readw(pci->dbi_base + offset + PCI_EXP_LNKSTA);
@@ -1391,7 +1392,21 @@  static int qcom_pcie_icc_update(struct qcom_pcie *pcie)
 		break;
 	}
 
-	return icc_set_bw(pcie->icc_mem, 0, width * bw);
+	if (pcie->opp_supported) {
+		opp = dev_pm_opp_find_level_exact(pci->dev, speed);
+		if (!IS_ERR(opp)) {
+			ret = dev_pm_opp_set_opp(pci->dev, opp);
+			if (ret)
+				dev_err(pci->dev, "Failed to set opp: level %d ret %d\n",
+					dev_pm_opp_get_level(opp), ret);
+			dev_pm_opp_put(opp);
+		}
+	}
+
+	if (pcie->icc_mem)
+		ret = icc_set_bw(pcie->icc_mem, 0, width * bw);
+
+	return ret;
 }
 
 static int qcom_pcie_link_transition_count(struct seq_file *s, void *data)
@@ -1434,8 +1449,10 @@  static void qcom_pcie_init_debugfs(struct qcom_pcie *pcie)
 static int qcom_pcie_probe(struct platform_device *pdev)
 {
 	const struct qcom_pcie_cfg *pcie_cfg;
+	unsigned long max_level = INT_MAX;
 	struct device *dev = &pdev->dev;
 	struct qcom_pcie *pcie;
+	struct dev_pm_opp *opp;
 	struct dw_pcie_rp *pp;
 	struct resource *res;
 	struct dw_pcie *pci;
@@ -1506,6 +1523,27 @@  static int qcom_pcie_probe(struct platform_device *pdev)
 	if (ret)
 		goto err_pm_runtime_put;
 
+	/* OPP table is optional */
+	ret = devm_pm_opp_of_add_table(dev);
+	if (ret && ret != -ENODEV) {
+		dev_err_probe(dev, ret, "Failed to add OPP table\n");
+		goto err_pm_runtime_put;
+	}
+
+	/* vote for max level in the opp table if opp table is present */
+	if (ret != -ENODEV) {
+		opp = dev_pm_opp_find_level_floor(dev, &max_level);
+		if (!IS_ERR(opp)) {
+			ret = dev_pm_opp_set_opp(dev, opp);
+			if (ret)
+				dev_err_probe(pci->dev, ret,
+					      "Failed to set opp: level %d\n",
+					      dev_pm_opp_get_level(opp));
+			dev_pm_opp_put(opp);
+		}
+		pcie->opp_supported = true;
+	}
+
 	ret = pcie->cfg->ops->get_resources(pcie);
 	if (ret)
 		goto err_pm_runtime_put;
@@ -1524,9 +1562,9 @@  static int qcom_pcie_probe(struct platform_device *pdev)
 		goto err_phy_exit;
 	}
 
-	ret = qcom_pcie_icc_update(pcie);
+	ret = qcom_pcie_icc_opp_update(pcie);
 	if (ret)
-		dev_err(dev, "failed to update interconnect bandwidth: %d\n",
+		dev_err(dev, "failed to update interconnect bandwidth/opp: %d\n",
 			ret);
 
 	if (pcie->mhi)
@@ -1575,6 +1613,8 @@  static int qcom_pcie_suspend_noirq(struct device *dev)
 	 */
 	if (!dw_pcie_link_up(pcie->pci)) {
 		qcom_pcie_host_deinit(&pcie->pci->pp);
+		if (pcie->opp_supported)
+			dev_pm_opp_set_opp(dev, NULL);
 		pcie->suspended = true;
 	}
 
@@ -1594,9 +1634,9 @@  static int qcom_pcie_resume_noirq(struct device *dev)
 		pcie->suspended = false;
 	}
 
-	ret = qcom_pcie_icc_update(pcie);
+	ret = qcom_pcie_icc_opp_update(pcie);
 	if (ret)
-		dev_err(dev, "failed to update interconnect bandwidth: %d\n",
+		dev_err(dev, "failed to update interconnect bandwidth/opp: %d\n",
 			ret);
 
 	return 0;