diff mbox series

[v5,2/2] Export Power Budgeting Extended Capability into pci-bus-sysfs

Message ID 20241210040826.11402-3-kobayashi.da-06@fujitsu.com (mailing list archive)
State Changes Requested
Delegated to: Krzysztof WilczyƄski
Headers show
Series Export PBEC Data register into sysfs | expand

Commit Message

Daisuke Kobayashi (Fujitsu) Dec. 10, 2024, 4:08 a.m. UTC
Add support for PBEC (Power Budgeting Extended Capability) output 
to the PCIe driver. PBEC is defined in the 
PCIe specification(PCIe r6.0, sec 7.8.1) and is
a standard method for obtaining device power consumption information.

Signed-off-by: "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com>
---
 Documentation/ABI/testing/sysfs-bus-pci |  62 ++++++++
 drivers/pci/pci-sysfs.c                 | 179 ++++++++++++++++++++++++
 2 files changed, 241 insertions(+)

Comments

Jonathan Cameron Dec. 11, 2024, 5:43 p.m. UTC | #1
On Tue, 10 Dec 2024 13:08:21 +0900
"Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com> wrote:

> Add support for PBEC (Power Budgeting Extended Capability) output 
> to the PCIe driver. PBEC is defined in the 
> PCIe specification(PCIe r6.0, sec 7.8.1) and is
> a standard method for obtaining device power consumption information.

I'm curious why you chose to drive the interface from sysfs as opposed to just
querying how many there are boot and providing separate files for each data
register, potentially a set of files for each one for better readability.

Slightly irritating is that there is no 'count' register so you'd have to
walk up or bisect just to find out how many there actually readable.
Then use is_visible() to hide the remainder.

Look like it is an 8 bit data select register, so maybe 256 max?
That would give you things like 
power_budget0_power
power_budget0_pm_state 
etc
Or maybe drop the leading power given where we are so
budget0_power etc

Mailboxes via sysfs are always a bit messy as there isn't really any
locking etc.

Sorry I'm coming in rather late with this query given you are already at v5!

Jonathan

> 
> Signed-off-by: "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com>
> ---
>  Documentation/ABI/testing/sysfs-bus-pci |  62 ++++++++
>  drivers/pci/pci-sysfs.c                 | 179 ++++++++++++++++++++++++
>  2 files changed, 241 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> index 7f63c7e97773..ec417ae20bc1 100644
> --- a/Documentation/ABI/testing/sysfs-bus-pci
> +++ b/Documentation/ABI/testing/sysfs-bus-pci
> @@ -572,3 +572,65 @@ Description:
>  		enclosure-specific indications "specific0" to "specific7",
>  		hence the corresponding led class devices are unavailable if
>  		the DSM interface is used.
> +
> +What:		/sys/bus/pci/devices/.../power_budget
> +Date:		December 2024
> +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> +Description:
> +		This file provides information about the PCIe power budget
> +		for the device. It is a read-only file that outputs the values
> +		of the Power Budgeting Data Register for each power state as a
> +		series of 32-bit hexadecimal values. Each line represents a
> +		single Power Budgeting Data register entry, containing the
> +		power budget for a specific power state.
> +
> +		The specific interpretation of these values depends on the
> +		device and the PCIe specification. Refer to the PCIe
> +		specification for detailed information about the Power
> +		Budgeting Data register, including the encoding	of power
> +		states and the interpretation of Base Power and Data Scale.
> +
> +What:		/sys/bus/pci/devices/.../power_budget/power_budget_data_select
> +Date:		December 2024
> +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> +Description:
> +		This is an 8-bit read/write register that selects the DWORD of 
> +		power budgeting data that will be displayed in the
> +		Power Budgeting Data. The value starts at zero and incrementing
> +		the index value selects the next DWORD.
> +
> +What:		/sys/bus/pci/devices/.../power_budget/power_budget_power
> +Date:		December 2024
> +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> +Description:
> +		This file provides the power consumption calculated by
> +		multiplying the base power by the data scale.
> +
> +
> +What:		/sys/bus/pci/devices/.../power_budget/power_budget_pm_state
> +Date:		December 2024
> +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> +Description:
> +		This file specifies the power management state of the operating
> +		condition.
> +
> +What:		/sys/bus/pci/devices/.../power_budget/power_budget_pm_substate
> +Date:		December 2024
> +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> +Description:
> +		This file specifies the power management sub state of the
> +		operating condition.
> +
> +What:		/sys/bus/pci/devices/.../power_budget/power_budget_type
> +Date:		December 2024
> +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> +Description:
> +		This file specifies the type of the operating condition.
> +
> +
> +What:		/sys/bus/pci/devices/.../power_budget/power_budget_rail
> +Date:		December 2024
> +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> +Description:
> +		This file Specifies the thermal load or power rail of the
> +		operating condition.
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index 5d0f4db1cab7..89909633ad02 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -238,6 +238,155 @@ static ssize_t current_link_width_show(struct device *dev,
>  }
>  static DEVICE_ATTR_RO(current_link_width);

> +
> +static ssize_t power_budget_rail_show(struct device *dev,
> +				       struct device_attribute *attr, char *buf)
> +{
> +	struct pci_dev *pci_dev = to_pci_dev(dev);
> +	int pos, err;
> +	u32 data;
> +
> +	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);

IIRC, this is not a cheap operation. I'd suggest storing it like we do for aer_cap
etc.

> +	if (!pos)
> +		return -EINVAL;
> +
> +	err = pci_read_config_dword(pci_dev, pos + PCI_PWR_DATA, &data);
> +	if (err)
> +		return -EINVAL;
> +
> +	return sysfs_emit(buf, "%s\n", 
> +				pci_power_budget_rail_string(PCI_PWR_DATA_RAIL(data)));

Unusual indenting. Align it below b

> +}
> +static DEVICE_ATTR_RO(power_budget_rail);
> +
>  static ssize_t secondary_bus_number_show(struct device *dev,
>  					 struct device_attribute *attr,
>  					 char *buf)
> @@ -636,6 +785,16 @@ static struct attribute *pcie_dev_attrs[] = {
>  	NULL,
>  };
>  
> +static struct attribute *pcie_pbec_attrs[] = {
> +	&dev_attr_power_budget_data_select.attr,
> +	&dev_attr_power_budget_power.attr,
> +	&dev_attr_power_budget_pm_state.attr,
> +	&dev_attr_power_budget_pm_substate.attr,
> +	&dev_attr_power_budget_rail.attr,
> +	&dev_attr_power_budget_type.attr,
> +	NULL,
No need for comma on terminating entries as we will never add anything after them.

> +};
Daisuke Kobayashi (Fujitsu) Dec. 12, 2024, 9:08 a.m. UTC | #2
Jonathan Cameron wrote:
> On Tue, 10 Dec 2024 13:08:21 +0900
> "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com> wrote:
> 
> > Add support for PBEC (Power Budgeting Extended Capability) output to
> > the PCIe driver. PBEC is defined in the PCIe specification(PCIe r6.0,
> > sec 7.8.1) and is a standard method for obtaining device power
> > consumption information.
> 
> I'm curious why you chose to drive the interface from sysfs as opposed to just
> querying how many there are boot and providing separate files for each data
> register, potentially a set of files for each one for better readability.
> 
> Slightly irritating is that there is no 'count' register so you'd have to walk up or
> bisect just to find out how many there actually readable.
> Then use is_visible() to hide the remainder.
> 
> Look like it is an 8 bit data select register, so maybe 256 max?
> That would give you things like
> power_budget0_power
> power_budget0_pm_state
> etc
> Or maybe drop the leading power given where we are so budget0_power etc
> 
> Mailboxes via sysfs are always a bit messy as there isn't really any locking etc.
> 
> Sorry I'm coming in rather late with this query given you are already at v5!
> 
> Jonathan
> 
Thank you for your valuable feedback and for taking the time to review my patch.

Regarding your proposed file structure:

budget0_power
budget0_pm_state
...
budget1_power
budget1_pm_state
...
budget2_xxx
...

Is this the correct format?

In my understanding, implementing this would require 256 attributes corresponding to
the DataSelectRegister, which seems overly redundant. Additionally, I was concerned
about the mechanism of changing the DataSelectRegister each time a file is read.

Therefore, I proposed this patch implementation, even though it might not be ideal.

If you have a suitable implementation to achieve your proposed structure, 
I would greatly appreciate it if you could share it.

Upon further reflection, the issue of needing to change the DataSelectRegister could 
potentially be avoided by initially retrieving and storing the register values for all indices.

Thank you for pointing out the coding style issue, I'll fix it in the next patch.

> >
> > Signed-off-by: "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com>
> > ---
> >  Documentation/ABI/testing/sysfs-bus-pci |  62 ++++++++
> >  drivers/pci/pci-sysfs.c                 | 179
> ++++++++++++++++++++++++
> >  2 files changed, 241 insertions(+)
> >
> > diff --git a/Documentation/ABI/testing/sysfs-bus-pci
> > b/Documentation/ABI/testing/sysfs-bus-pci
> > index 7f63c7e97773..ec417ae20bc1 100644
> > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > @@ -572,3 +572,65 @@ Description:
> >  		enclosure-specific indications "specific0" to "specific7",
> >  		hence the corresponding led class devices are unavailable if
> >  		the DSM interface is used.
> > +
> > +What:		/sys/bus/pci/devices/.../power_budget
> > +Date:		December 2024
> > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > +Description:
> > +		This file provides information about the PCIe power budget
> > +		for the device. It is a read-only file that outputs the values
> > +		of the Power Budgeting Data Register for each power state as
> a
> > +		series of 32-bit hexadecimal values. Each line represents a
> > +		single Power Budgeting Data register entry, containing the
> > +		power budget for a specific power state.
> > +
> > +		The specific interpretation of these values depends on the
> > +		device and the PCIe specification. Refer to the PCIe
> > +		specification for detailed information about the Power
> > +		Budgeting Data register, including the encoding	of power
> > +		states and the interpretation of Base Power and Data Scale.
> > +
> > +What:
> 	/sys/bus/pci/devices/.../power_budget/power_budget_data_select
> > +Date:		December 2024
> > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > +Description:
> > +		This is an 8-bit read/write register that selects the DWORD of
> > +		power budgeting data that will be displayed in the
> > +		Power Budgeting Data. The value starts at zero and
> incrementing
> > +		the index value selects the next DWORD.
> > +
> > +What:
> 	/sys/bus/pci/devices/.../power_budget/power_budget_power
> > +Date:		December 2024
> > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > +Description:
> > +		This file provides the power consumption calculated by
> > +		multiplying the base power by the data scale.
> > +
> > +
> > +What:
> 	/sys/bus/pci/devices/.../power_budget/power_budget_pm_state
> > +Date:		December 2024
> > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > +Description:
> > +		This file specifies the power management state of the
> operating
> > +		condition.
> > +
> > +What:
> 	/sys/bus/pci/devices/.../power_budget/power_budget_pm_substat
> e
> > +Date:		December 2024
> > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > +Description:
> > +		This file specifies the power management sub state of the
> > +		operating condition.
> > +
> > +What:
> 	/sys/bus/pci/devices/.../power_budget/power_budget_type
> > +Date:		December 2024
> > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > +Description:
> > +		This file specifies the type of the operating condition.
> > +
> > +
> > +What:
> 	/sys/bus/pci/devices/.../power_budget/power_budget_rail
> > +Date:		December 2024
> > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > +Description:
> > +		This file Specifies the thermal load or power rail of the
> > +		operating condition.
> > diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index
> > 5d0f4db1cab7..89909633ad02 100644
> > --- a/drivers/pci/pci-sysfs.c
> > +++ b/drivers/pci/pci-sysfs.c
> > @@ -238,6 +238,155 @@ static ssize_t current_link_width_show(struct
> > device *dev,  }  static DEVICE_ATTR_RO(current_link_width);
> 
> > +
> > +static ssize_t power_budget_rail_show(struct device *dev,
> > +				       struct device_attribute *attr, char
> *buf) {
> > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > +	int pos, err;
> > +	u32 data;
> > +
> > +	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);
> 
> IIRC, this is not a cheap operation. I'd suggest storing it like we do for aer_cap
> etc.
> 
> > +	if (!pos)
> > +		return -EINVAL;
> > +
> > +	err = pci_read_config_dword(pci_dev, pos + PCI_PWR_DATA, &data);
> > +	if (err)
> > +		return -EINVAL;
> > +
> > +	return sysfs_emit(buf, "%s\n",
> > +
> 	pci_power_budget_rail_string(PCI_PWR_DATA_RAIL(data)));
> 
> Unusual indenting. Align it below b
> 
> > +}
> > +static DEVICE_ATTR_RO(power_budget_rail);
> > +
> >  static ssize_t secondary_bus_number_show(struct device *dev,
> >  					 struct device_attribute *attr,
> >  					 char *buf)
> > @@ -636,6 +785,16 @@ static struct attribute *pcie_dev_attrs[] = {
> >  	NULL,
> >  };
> >
> > +static struct attribute *pcie_pbec_attrs[] = {
> > +	&dev_attr_power_budget_data_select.attr,
> > +	&dev_attr_power_budget_power.attr,
> > +	&dev_attr_power_budget_pm_state.attr,
> > +	&dev_attr_power_budget_pm_substate.attr,
> > +	&dev_attr_power_budget_rail.attr,
> > +	&dev_attr_power_budget_type.attr,
> > +	NULL,
> No need for comma on terminating entries as we will never add anything after
> them.
> 
> > +};
Jonathan Cameron Dec. 12, 2024, 12:47 p.m. UTC | #3
On Thu, 12 Dec 2024 09:08:54 +0000
"Daisuke Kobayashi (Fujitsu)" <kobayashi.da-06@fujitsu.com> wrote:

> Jonathan Cameron wrote:
> > On Tue, 10 Dec 2024 13:08:21 +0900
> > "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com> wrote:
> >   
> > > Add support for PBEC (Power Budgeting Extended Capability) output to
> > > the PCIe driver. PBEC is defined in the PCIe specification(PCIe r6.0,
> > > sec 7.8.1) and is a standard method for obtaining device power
> > > consumption information.  
> > 
> > I'm curious why you chose to drive the interface from sysfs as opposed to just
> > querying how many there are boot and providing separate files for each data
> > register, potentially a set of files for each one for better readability.
> > 
> > Slightly irritating is that there is no 'count' register so you'd have to walk up or
> > bisect just to find out how many there actually readable.
> > Then use is_visible() to hide the remainder.
> > 
> > Look like it is an 8 bit data select register, so maybe 256 max?
> > That would give you things like
> > power_budget0_power
> > power_budget0_pm_state
> > etc
> > Or maybe drop the leading power given where we are so budget0_power etc
> > 
> > Mailboxes via sysfs are always a bit messy as there isn't really any locking etc.
> > 
> > Sorry I'm coming in rather late with this query given you are already at v5!
> > 
> > Jonathan
> >   
> Thank you for your valuable feedback and for taking the time to review my patch.
> 
> Regarding your proposed file structure:
> 
> budget0_power
> budget0_pm_state
> ...
> budget1_power
> budget1_pm_state
> ...
> budget2_xxx
> ...
> 
> Is this the correct format?

Yes. That is what I had mind.

> 
> In my understanding, implementing this would require 256 attributes corresponding to
> the DataSelectRegister, which seems overly redundant. Additionally, I was concerned
> about the mechanism of changing the DataSelectRegister each time a file is read.

I agree it is quite a large number of files, but there are larger sysfs interfaces
so I'm not that concerned. Do you have any data on how many entries are typically used?
We could use an is_visible() callback and register all 256, or we could do dynamic
allocation though that would require walking to find out what is there.

The implementation should cache the current dataselectregister value and only write it
if the value needs to change.  Sensible userspace software would just read all
the elements of each budgetX_* before moving on to the next one.

> 
> Therefore, I proposed this patch implementation, even though it might not be ideal.
> 
> If you have a suitable implementation to achieve your proposed structure, 
> I would greatly appreciate it if you could share it.
> 
> Upon further reflection, the issue of needing to change the DataSelectRegister could 
> potentially be avoided by initially retrieving and storing the register values for all indices.

Cacheing these is definitely an option to explore.  I'm not sure we would want to pay that cost
at boot, but if not we could do it on first touch.  The 0 value is reserved anyway so
if cached copy is 0, read it from the device and fill the cache.
We would need to read some at boot to figure out which should be visible, but
that would be logN of max entries at worse, rather than all 256.  If there are only
a few it would make sense to just read them all at boot.

Annoying there isn't a count register for these!

Jonathan




> 
> Thank you for pointing out the coding style issue, I'll fix it in the next patch.
> 
> > >
> > > Signed-off-by: "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com>
> > > ---
> > >  Documentation/ABI/testing/sysfs-bus-pci |  62 ++++++++
> > >  drivers/pci/pci-sysfs.c                 | 179  
> > ++++++++++++++++++++++++  
> > >  2 files changed, 241 insertions(+)
> > >
> > > diff --git a/Documentation/ABI/testing/sysfs-bus-pci
> > > b/Documentation/ABI/testing/sysfs-bus-pci
> > > index 7f63c7e97773..ec417ae20bc1 100644
> > > --- a/Documentation/ABI/testing/sysfs-bus-pci
> > > +++ b/Documentation/ABI/testing/sysfs-bus-pci
> > > @@ -572,3 +572,65 @@ Description:
> > >  		enclosure-specific indications "specific0" to "specific7",
> > >  		hence the corresponding led class devices are unavailable if
> > >  		the DSM interface is used.
> > > +
> > > +What:		/sys/bus/pci/devices/.../power_budget
> > > +Date:		December 2024
> > > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > > +Description:
> > > +		This file provides information about the PCIe power budget
> > > +		for the device. It is a read-only file that outputs the values
> > > +		of the Power Budgeting Data Register for each power state as  
> > a  
> > > +		series of 32-bit hexadecimal values. Each line represents a
> > > +		single Power Budgeting Data register entry, containing the
> > > +		power budget for a specific power state.
> > > +
> > > +		The specific interpretation of these values depends on the
> > > +		device and the PCIe specification. Refer to the PCIe
> > > +		specification for detailed information about the Power
> > > +		Budgeting Data register, including the encoding	of power
> > > +		states and the interpretation of Base Power and Data Scale.
> > > +
> > > +What:  
> > 	/sys/bus/pci/devices/.../power_budget/power_budget_data_select  
> > > +Date:		December 2024
> > > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > > +Description:
> > > +		This is an 8-bit read/write register that selects the DWORD of
> > > +		power budgeting data that will be displayed in the
> > > +		Power Budgeting Data. The value starts at zero and  
> > incrementing  
> > > +		the index value selects the next DWORD.
> > > +
> > > +What:  
> > 	/sys/bus/pci/devices/.../power_budget/power_budget_power  
> > > +Date:		December 2024
> > > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > > +Description:
> > > +		This file provides the power consumption calculated by
> > > +		multiplying the base power by the data scale.
> > > +
> > > +
> > > +What:  
> > 	/sys/bus/pci/devices/.../power_budget/power_budget_pm_state  
> > > +Date:		December 2024
> > > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > > +Description:
> > > +		This file specifies the power management state of the  
> > operating  
> > > +		condition.
> > > +
> > > +What:  
> > 	/sys/bus/pci/devices/.../power_budget/power_budget_pm_substat
> > e  
> > > +Date:		December 2024
> > > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > > +Description:
> > > +		This file specifies the power management sub state of the
> > > +		operating condition.
> > > +
> > > +What:  
> > 	/sys/bus/pci/devices/.../power_budget/power_budget_type  
> > > +Date:		December 2024
> > > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > > +Description:
> > > +		This file specifies the type of the operating condition.
> > > +
> > > +
> > > +What:  
> > 	/sys/bus/pci/devices/.../power_budget/power_budget_rail  
> > > +Date:		December 2024
> > > +Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
> > > +Description:
> > > +		This file Specifies the thermal load or power rail of the
> > > +		operating condition.
> > > diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index
> > > 5d0f4db1cab7..89909633ad02 100644
> > > --- a/drivers/pci/pci-sysfs.c
> > > +++ b/drivers/pci/pci-sysfs.c
> > > @@ -238,6 +238,155 @@ static ssize_t current_link_width_show(struct
> > > device *dev,  }  static DEVICE_ATTR_RO(current_link_width);  
> >   
> > > +
> > > +static ssize_t power_budget_rail_show(struct device *dev,
> > > +				       struct device_attribute *attr, char  
> > *buf) {  
> > > +	struct pci_dev *pci_dev = to_pci_dev(dev);
> > > +	int pos, err;
> > > +	u32 data;
> > > +
> > > +	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);  
> > 
> > IIRC, this is not a cheap operation. I'd suggest storing it like we do for aer_cap
> > etc.
> >   
> > > +	if (!pos)
> > > +		return -EINVAL;
> > > +
> > > +	err = pci_read_config_dword(pci_dev, pos + PCI_PWR_DATA, &data);
> > > +	if (err)
> > > +		return -EINVAL;
> > > +
> > > +	return sysfs_emit(buf, "%s\n",
> > > +  
> > 	pci_power_budget_rail_string(PCI_PWR_DATA_RAIL(data)));
> > 
> > Unusual indenting. Align it below b
> >   
> > > +}
> > > +static DEVICE_ATTR_RO(power_budget_rail);
> > > +
> > >  static ssize_t secondary_bus_number_show(struct device *dev,
> > >  					 struct device_attribute *attr,
> > >  					 char *buf)
> > > @@ -636,6 +785,16 @@ static struct attribute *pcie_dev_attrs[] = {
> > >  	NULL,
> > >  };
> > >
> > > +static struct attribute *pcie_pbec_attrs[] = {
> > > +	&dev_attr_power_budget_data_select.attr,
> > > +	&dev_attr_power_budget_power.attr,
> > > +	&dev_attr_power_budget_pm_state.attr,
> > > +	&dev_attr_power_budget_pm_substate.attr,
> > > +	&dev_attr_power_budget_rail.attr,
> > > +	&dev_attr_power_budget_type.attr,
> > > +	NULL,  
> > No need for comma on terminating entries as we will never add anything after
> > them.
> >   
> > > +};  
>
Daisuke Kobayashi (Fujitsu) Dec. 13, 2024, 9:41 a.m. UTC | #4
Thank you for your detailed feedback and suggestions. Based on your comments,
I have a few thoughts and questions to ensure we move forward effectively.

Jonathan Cameron wrote:
> On Thu, 12 Dec 2024 09:08:54 +0000
> "Daisuke Kobayashi (Fujitsu)" <kobayashi.da-06@fujitsu.com> wrote:
> 
> > Jonathan Cameron wrote:
> > > On Tue, 10 Dec 2024 13:08:21 +0900
> > > "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com> wrote:

... snip ...
> > Regarding your proposed file structure:
> >
> > budget0_power
> > budget0_pm_state
> > ...
> > budget1_power
> > budget1_pm_state
> > ...
> > budget2_xxx
> > ...
> >
> > Is this the correct format?
> 
> Yes. That is what I had mind.
> 

Thanks, I see.

> >
> > In my understanding, implementing this would require 256 attributes
> > corresponding to the DataSelectRegister, which seems overly redundant.
> > Additionally, I was concerned about the mechanism of changing the
> DataSelectRegister each time a file is read.
> 
> I agree it is quite a large number of files, but there are larger sysfs interfaces so
> I'm not that concerned. Do you have any data on how many entries are typically
> used?
> We could use an is_visible() callback and register all 256, or we could do
> dynamic allocation though that would require walking to find out what is there.
> 
> The implementation should cache the current dataselectregister value and only
> write it if the value needs to change.  Sensible userspace software would just
> read all the elements of each budgetX_* before moving on to the next one.
> 

Unfortunately, I don't have specific data on the typical number of entries used.
However, I estimate that it would be within 10 at most.
Could you please provide examples of large sysfs interfaces and where they are
implemented? I would like to refer to them for implementation.
In pci-sysfs.c, there is an attribute called resourceX_resize, and I believe its basic
concept could be useful. However, the PowerBudget implementation is more complex
because it involves multiple sysfs outputs for each index.
If there are more reference implementations, I could propose a more mature implementation.

> >
> > Therefore, I proposed this patch implementation, even though it might not be
> ideal.
> >
> > If you have a suitable implementation to achieve your proposed
> > structure, I would greatly appreciate it if you could share it.
> >
> > Upon further reflection, the issue of needing to change the
> > DataSelectRegister could potentially be avoided by initially retrieving and
> storing the register values for all indices.
> 
> Cacheing these is definitely an option to explore.  I'm not sure we would want
> to pay that cost at boot, but if not we could do it on first touch.  The 0 value is
> reserved anyway so if cached copy is 0, read it from the device and fill the
> cache.
> We would need to read some at boot to figure out which should be visible, but
> that would be logN of max entries at worse, rather than all 256.  If there are only
> a few it would make sense to just read them all at boot.
> 
Although the exact number of indices is uncertain, I believe it is sufficiently small. 
Therefore, I plan to adopt a method where all values are read and cached at boot time. 
I intend to create an array in the struct pci_dev and initialize it in the probe function.
If there are any other suitable places or methods, please let me know.

> Annoying there isn't a count register for these!

I also feel that. Thank you.
> 
> Jonathan
> 
> 
> 
> 
> >
> > Thank you for pointing out the coding style issue, I'll fix it in the next patch.
> >
> > > >
> > > > Signed-off-by: "Kobayashi,Daisuke" <kobayashi.da-06@fujitsu.com>
... snip ...
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 7f63c7e97773..ec417ae20bc1 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -572,3 +572,65 @@  Description:
 		enclosure-specific indications "specific0" to "specific7",
 		hence the corresponding led class devices are unavailable if
 		the DSM interface is used.
+
+What:		/sys/bus/pci/devices/.../power_budget
+Date:		December 2024
+Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
+Description:
+		This file provides information about the PCIe power budget
+		for the device. It is a read-only file that outputs the values
+		of the Power Budgeting Data Register for each power state as a
+		series of 32-bit hexadecimal values. Each line represents a
+		single Power Budgeting Data register entry, containing the
+		power budget for a specific power state.
+
+		The specific interpretation of these values depends on the
+		device and the PCIe specification. Refer to the PCIe
+		specification for detailed information about the Power
+		Budgeting Data register, including the encoding	of power
+		states and the interpretation of Base Power and Data Scale.
+
+What:		/sys/bus/pci/devices/.../power_budget/power_budget_data_select
+Date:		December 2024
+Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
+Description:
+		This is an 8-bit read/write register that selects the DWORD of 
+		power budgeting data that will be displayed in the
+		Power Budgeting Data. The value starts at zero and incrementing
+		the index value selects the next DWORD.
+
+What:		/sys/bus/pci/devices/.../power_budget/power_budget_power
+Date:		December 2024
+Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
+Description:
+		This file provides the power consumption calculated by
+		multiplying the base power by the data scale.
+
+
+What:		/sys/bus/pci/devices/.../power_budget/power_budget_pm_state
+Date:		December 2024
+Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
+Description:
+		This file specifies the power management state of the operating
+		condition.
+
+What:		/sys/bus/pci/devices/.../power_budget/power_budget_pm_substate
+Date:		December 2024
+Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
+Description:
+		This file specifies the power management sub state of the
+		operating condition.
+
+What:		/sys/bus/pci/devices/.../power_budget/power_budget_type
+Date:		December 2024
+Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
+Description:
+		This file specifies the type of the operating condition.
+
+
+What:		/sys/bus/pci/devices/.../power_budget/power_budget_rail
+Date:		December 2024
+Contact:	Kobayashi Daisuke <kobayashi.da-06@fujitsu.com>
+Description:
+		This file Specifies the thermal load or power rail of the
+		operating condition.
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 5d0f4db1cab7..89909633ad02 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -238,6 +238,155 @@  static ssize_t current_link_width_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(current_link_width);
 
+static ssize_t power_budget_data_select_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	int pos;
+	u8 val;
+
+	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);
+	if (!pos)
+		return -EINVAL;
+
+	if (kstrtou8(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	pci_write_config_byte(pci_dev, pos + PCI_PWR_DSR, val);
+
+	return count;
+}
+
+static ssize_t power_budget_data_select_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	int pos, err;
+	u8 data;
+
+	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);
+	if (!pos)
+		return -EINVAL;
+
+	err = pci_read_config_byte(pci_dev, pos + PCI_PWR_DSR, &data);
+	if (err)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%u\n", data);
+}
+
+static DEVICE_ATTR_RW(power_budget_data_select);
+
+static ssize_t power_budget_power_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	int pos, err;
+	u32 data;
+	u8 base, scale_up, scale_low, scale;
+
+	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);
+	if (!pos)
+		return -EINVAL;
+
+	err = pci_read_config_dword(pci_dev, pos + PCI_PWR_DATA, &data);
+	if (err)
+		return -EINVAL;
+
+	base = PCI_PWR_DATA_BASE(data);
+	scale_up = PCI_PWR_DATA_SCALE_UP(data);
+	scale_low = PCI_PWR_DATA_SCALE(data);
+	scale = scale_up << 2 | scale_low;
+	if (scale == 0 && base >= 0xF0)
+		return sysfs_emit(buf, "%s\n",pci_power_budget_alt_encode_string(data));
+
+	return sysfs_emit(buf, "%u%s\n", base, pci_power_budget_scale_string(scale));
+}
+static DEVICE_ATTR_RO(power_budget_power);
+
+static ssize_t power_budget_pm_state_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	int pos, err;
+	u32 data;
+
+	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);
+	if (!pos)
+		return -EINVAL;
+
+	err = pci_read_config_dword(pci_dev, pos + PCI_PWR_DATA, &data);
+	if (err)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "D%u\n", PCI_PWR_DATA_PM_STATE(data));
+}
+static DEVICE_ATTR_RO(power_budget_pm_state);
+
+static ssize_t power_budget_pm_substate_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	int pos, err;
+	u8 substate;
+	u32 data;
+
+	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);
+	if (!pos)
+		return -EINVAL;
+
+	err = pci_read_config_dword(pci_dev, pos + PCI_PWR_DATA, &data);
+	if (err)
+		return -EINVAL;
+	
+	substate = PCI_PWR_DATA_PM_SUB(data);
+	if (substate == 0)
+		return sysfs_emit(buf, "Default Sub State\n");
+
+	return sysfs_emit(buf, "Device Specific Sub State\n");
+}
+static DEVICE_ATTR_RO(power_budget_pm_substate);
+
+static ssize_t power_budget_type_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	int pos, err;
+	u32 data;
+
+	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);
+	if (!pos)
+		return -EINVAL;
+
+	err = pci_read_config_dword(pci_dev, pos + PCI_PWR_DATA, &data);
+	if (err)
+		return -EINVAL;
+	
+	return sysfs_emit(buf, "%u\n", PCI_PWR_DATA_TYPE(data));
+}
+static DEVICE_ATTR_RO(power_budget_type);
+
+static ssize_t power_budget_rail_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	int pos, err;
+	u32 data;
+
+	pos = pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_PWR);
+	if (!pos)
+		return -EINVAL;
+
+	err = pci_read_config_dword(pci_dev, pos + PCI_PWR_DATA, &data);
+	if (err)
+		return -EINVAL;
+
+	return sysfs_emit(buf, "%s\n", 
+				pci_power_budget_rail_string(PCI_PWR_DATA_RAIL(data)));
+}
+static DEVICE_ATTR_RO(power_budget_rail);
+
 static ssize_t secondary_bus_number_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -636,6 +785,16 @@  static struct attribute *pcie_dev_attrs[] = {
 	NULL,
 };
 
+static struct attribute *pcie_pbec_attrs[] = {
+	&dev_attr_power_budget_data_select.attr,
+	&dev_attr_power_budget_power.attr,
+	&dev_attr_power_budget_pm_state.attr,
+	&dev_attr_power_budget_pm_substate.attr,
+	&dev_attr_power_budget_rail.attr,
+	&dev_attr_power_budget_type.attr,
+	NULL,
+};
+
 static struct attribute *pcibus_attrs[] = {
 	&dev_attr_bus_rescan.attr,
 	&dev_attr_cpuaffinity.attr,
@@ -1610,6 +1769,19 @@  static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj,
 	return 0;
 }
 
+static umode_t pcie_pbec_attrs_are_visible(struct kobject *kobj,
+					  struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (pci_is_pcie(pdev) && 
+		pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PWR))
+		return a->mode;
+
+	return 0;
+}
+
 static const struct attribute_group pci_dev_group = {
 	.attrs = pci_dev_attrs,
 };
@@ -1652,6 +1824,12 @@  static const struct attribute_group pcie_dev_attr_group = {
 	.is_visible = pcie_dev_attrs_are_visible,
 };
 
+static const struct attribute_group pcie_pbec_attr_group = {
+	.name = "power_budget",
+	.attrs = pcie_pbec_attrs,
+	.is_visible = pcie_pbec_attrs_are_visible,
+};
+
 const struct attribute_group *pci_dev_attr_groups[] = {
 	&pci_dev_attr_group,
 	&pci_dev_hp_attr_group,
@@ -1661,6 +1839,7 @@  const struct attribute_group *pci_dev_attr_groups[] = {
 #endif
 	&pci_bridge_attr_group,
 	&pcie_dev_attr_group,
+	&pcie_pbec_attr_group,
 #ifdef CONFIG_PCIEAER
 	&aer_stats_attr_group,
 #endif