Message ID | 20231013034158.2745127-2-alistair.francis@wdc.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Bjorn Helgaas |
Headers | show |
Series | [v9,1/3] PCI/DOE: Rename DOE protocol to feature | expand |
On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote: > +#ifdef CONFIG_SYSFS > +static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj, > + struct attribute *a, int n) > +{ > + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); > + struct pci_doe_mb *doe_mb; > + unsigned long index, j; > + void *entry; > + > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > + xa_for_each(&doe_mb->feats, j, entry) > + return a->mode; > + } > + > + return 0; > +} Out of curiosity: Does this method prevent creation of a "doe_features" directory for devices which don't have any DOE mailboxes? (If it does, a code comment explaining that might be helpful.) > +const struct attribute_group pci_dev_doe_feature_group = { > + .name = "doe_features", > + .attrs = pci_dev_doe_feature_attrs, > + .is_visible = pci_doe_sysfs_attr_is_visible, > +}; Nit: Wondering why the "=" is aligned for .name and .attrs but not for .is_visible? > +static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev, > + struct pci_doe_mb *doe_mb) > +{ > + struct device *dev = &pdev->dev; > + struct device_attribute *attrs = doe_mb->sysfs_attrs; > + unsigned long i; > + void *entry; Nit: Inverse Christmas tree? > +static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev, > + struct pci_doe_mb *doe_mb) > +{ > + struct device *dev = &pdev->dev; > + struct device_attribute *attrs; > + unsigned long num_features = 0; > + unsigned long vid, type; > + unsigned long i; > + void *entry; > + int ret; > + > + xa_for_each(&doe_mb->feats, i, entry) > + num_features++; > + > + attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL); > + if (!attrs) > + return -ENOMEM; > + > + doe_mb->sysfs_attrs = attrs; > + xa_for_each(&doe_mb->feats, i, entry) { > + sysfs_attr_init(&attrs[i].attr); > + vid = xa_to_value(entry) >> 8; > + type = xa_to_value(entry) & 0xFF; > + attrs[i].attr.name = kasprintf(GFP_KERNEL, > + "0x%04lX:%02lX", vid, type); Nit: Capital X conversion specifier will result in upper case hex characters in filename and contents, whereas existing sysfs attributes such as "vendor", "device" contain lower case hex characters. Might want to consider lower case x for consistency. > +void pci_doe_sysfs_teardown(struct pci_dev *pdev) > +{ > + struct pci_doe_mb *doe_mb; > + unsigned long index; > + > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > + pci_doe_sysfs_feature_remove(pdev, doe_mb); > + } Nit: Curly braces not necessary. > @@ -1153,6 +1154,10 @@ static void pci_remove_resource_files(struct pci_dev *pdev) > { > int i; > > + if (IS_ENABLED(CONFIG_PCI_DOE)) { > + pci_doe_sysfs_teardown(pdev); > + } Nit: Curly braces not necessary. > @@ -1230,6 +1235,12 @@ static int pci_create_resource_files(struct pci_dev *pdev) > int i; > int retval; > > + if (IS_ENABLED(CONFIG_PCI_DOE)) { > + retval = pci_doe_sysfs_init(pdev); > + if (retval) > + return retval; > + } > + > /* Expose the PCI resources from this device as files */ > for (i = 0; i < PCI_STD_NUM_BARS; i++) { I think this needs to be added to pci_create_sysfs_dev_files() instead. pci_create_resource_files() only deals with creation of resource files, as the name implies, which is unrelated to DOE features. Worse, pci_create_resource_files() is also called from pci_dev_resource_resize_attr(), i.e. every time user space writes to the "resource##n##_resize" attributes. Similarly, the call to pci_doe_sysfs_teardown() belongs in pci_remove_sysfs_dev_files(). Thanks, Lukas
On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote: > The PCIe 6 specification added support for the Data Object Exchange (DOE). > When DOE is supported the DOE Discovery Feature must be implemented per > PCIe r6.1 sec 6.30.1.1. The protocol allows a requester to obtain > information about the other DOE features supported by the device. > ... > +static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj, > + struct attribute *a, int n) > +{ > + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); > + struct pci_doe_mb *doe_mb; > + unsigned long index, j; > + void *entry; > + > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > + xa_for_each(&doe_mb->feats, j, entry) > + return a->mode; > + } > + > + return 0; The nested loops that don't test anything look a little weird and maybe I'm missing something, but this looks like it returns a->mode if any mailbox with a feature exists, and 0 otherwise. Is that the same as this: if (pdev->doe_mbs) return a->mode; return 0; since it sounds like a mailbox must support at least one feature? > +} > + > +static struct attribute *pci_dev_doe_feature_attrs[] = { > + NULL, > +}; > + > +const struct attribute_group pci_dev_doe_feature_group = { > + .name = "doe_features", > + .attrs = pci_dev_doe_feature_attrs, > + .is_visible = pci_doe_sysfs_attr_is_visible, > +}; > + > +static ssize_t pci_doe_sysfs_feature_show(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + return sysfs_emit(buf, "%s\n", attr->attr.name); > +} > + > +static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev, > + struct pci_doe_mb *doe_mb) > +{ > + struct device *dev = &pdev->dev; > + struct device_attribute *attrs = doe_mb->sysfs_attrs; > + unsigned long i; > + void *entry; > + > + if (!attrs) > + return; > + > + doe_mb->sysfs_attrs = NULL; > + xa_for_each(&doe_mb->feats, i, entry) { > + if (attrs[i].show) > + sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, > + pci_dev_doe_feature_group.name); > + kfree(attrs[i].attr.name); > + } > + kfree(attrs); > +} > + > +static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev, > + struct pci_doe_mb *doe_mb) > +{ > + struct device *dev = &pdev->dev; > + struct device_attribute *attrs; > + unsigned long num_features = 0; > + unsigned long vid, type; > + unsigned long i; > + void *entry; > + int ret; > + > + xa_for_each(&doe_mb->feats, i, entry) > + num_features++; > + > + attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL); > + if (!attrs) > + return -ENOMEM; > + > + doe_mb->sysfs_attrs = attrs; > + xa_for_each(&doe_mb->feats, i, entry) { > + sysfs_attr_init(&attrs[i].attr); > + vid = xa_to_value(entry) >> 8; > + type = xa_to_value(entry) & 0xFF; > + attrs[i].attr.name = kasprintf(GFP_KERNEL, > + "0x%04lX:%02lX", vid, type); What's the rationale for using "0x" on the vendor ID but not on the type? "0x1234:10" hints that the "10" might be decimal since it lacks "0x". Suggest lower-case "%04lx:%02lx" either way. FWIW, there's no "0x" prefix on the hex vendor IDs in "lspci -n" output and dmesg messages like this: pci 0000:01:00.0: [10de:13b6] type 00 > + if (!attrs[i].attr.name) { > + ret = -ENOMEM; > + goto fail; > + } > + > + attrs[i].attr.mode = 0444; > + attrs[i].show = pci_doe_sysfs_feature_show; > + > + ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, > + pci_dev_doe_feature_group.name); > + if (ret) { > + attrs[i].show = NULL; > + goto fail; > + } > + } > + > + return 0; > + > +fail: > + pci_doe_sysfs_feature_remove(pdev, doe_mb); > + return ret; Not sure we should treat this failure this seriously. Looks like it will prevent creation of the BAR resource files, and possibly even abort pci_sysfs_init() early. I think the pci_dev will still be usable (lacking DOE sysfs) even if this fails. > +} > + > +void pci_doe_sysfs_teardown(struct pci_dev *pdev) > +{ > + struct pci_doe_mb *doe_mb; > + unsigned long index; > + > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > + pci_doe_sysfs_feature_remove(pdev, doe_mb); > + } > +} > + > +int pci_doe_sysfs_init(struct pci_dev *pdev) > +{ > + struct pci_doe_mb *doe_mb; > + unsigned long index; > + int ret; > + > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > + ret = pci_doe_sysfs_feature_populate(pdev, doe_mb); > + if (ret) > + return ret; > + } I agree with Lukas that pci_create_resource_files() is not the right place to call this. I try hard to avoid calling *anything* from the pci_create_sysfs_dev_files() path because it has the nasty "sysfs_initialized" check and the associated pci_sysfs_init() initcall. It's so much cleaner when we can set up static attributes that are automatically added in the device_add() path. I don't know whether that's possible. I see lots of discussion with Greg KH that might be related, but I'm not sure. I do know that we enumerate the mailboxes and features during pci_init_capabilities(), which happens before device_add(), so the information about which attributes should be present is at least *available* early enough: pci_host_probe pci_scan_root_bus_bridge ... pci_scan_single_device pci_device_add pci_init_capabilities pci_doe_init while (pci_find_next_ext_capability(PCI_EXT_CAP_ID_DOE)) pci_doe_create_mb pci_doe_cache_features pci_doe_discovery xa_insert(&doe_mb->feats) <-- device_add device_add_attrs device_add_groups pci_bus_add_devices pci_bus_add_device pci_create_sysfs_dev_files ... pci_doe_sysfs_init <-- xa_for_each(&pdev->doe_mbs) pci_doe_sysfs_feature_populate xa_for_each(&doe_mb->feats) sysfs_add_file_to_group(pci_dev_doe_feature_group.name) Is it feasible to build an attribute group in pci_doe_init() and add it to dev->groups so device_add() will automatically add them? It looks like __power_supply_register() does something like that: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/power/supply/power_supply_core.c?id=v6.5#n1375 > --- a/include/linux/pci-doe.h > +++ b/include/linux/pci-doe.h > @@ -22,4 +22,6 @@ int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type, > const void *request, size_t request_sz, > void *response, size_t response_sz); > > +int pci_doe_sysfs_init(struct pci_dev *pci_dev); > +void pci_doe_sysfs_teardown(struct pci_dev *pdev); These declarations look like they should be in drivers/pci/pci.h as well. I don't think anything outside the PCI core should need these. Bjorn
On Thu, Oct 19, 2023 at 11:58:29AM -0500, Bjorn Helgaas wrote: > On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote: > > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > > + xa_for_each(&doe_mb->feats, j, entry) > > + return a->mode; > > + } > > + > > + return 0; > > The nested loops that don't test anything look a little weird and > maybe I'm missing something, but this looks like it returns a->mode if > any mailbox with a feature exists, and 0 otherwise. > > Is that the same as this: > > if (pdev->doe_mbs) > return a->mode; > > return 0; > > since it sounds like a mailbox must support at least one feature? In theory it's the same, in practice there *might* be non-compliant devices which lack support for the discovery feature. > > + attrs[i].attr.name = kasprintf(GFP_KERNEL, > > + "0x%04lX:%02lX", vid, type); > > What's the rationale for using "0x" on the vendor ID but not on the > type? "0x1234:10" hints that the "10" might be decimal since it lacks > "0x". > > Suggest lower-case "%04lx:%02lx" either way. > > FWIW, there's no "0x" prefix on the hex vendor IDs in "lspci -n" > output and dmesg messages like this: > > pci 0000:01:00.0: [10de:13b6] type 00 The existing attributes "vendor", "device" etc do emit the "0x". From drivers/pci/pci-sysfs.c: pci_config_attr(vendor, "0x%04x\n"); pci_config_attr(device, "0x%04x\n"); pci_config_attr(subsystem_vendor, "0x%04x\n"); pci_config_attr(subsystem_device, "0x%04x\n"); pci_config_attr(revision, "0x%02x\n"); pci_config_attr(class, "0x%06x\n"); > I try hard to avoid calling *anything* from the > pci_create_sysfs_dev_files() path because it has the nasty > "sysfs_initialized" check and the associated pci_sysfs_init() > initcall. What's the purpose of sysfs_initialized anyway? It was introduced by this historic commit: https://git.kernel.org/tglx/history/c/f6d553444da2 Can PCI_ROM_RESOURCEs appear after device enumeration but before the late_initcall stage? If sysfs_initialized is only needed for PCI_ROM_RESOURCEs, can we constrain pci_sysfs_init() to those and avoid creating all the other runtime sysfs attributes in the initcall? Thanks, Lukas
On Thu, Oct 19, 2023 at 09:32:46PM +0200, Lukas Wunner wrote: > On Thu, Oct 19, 2023 at 11:58:29AM -0500, Bjorn Helgaas wrote: > > On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote: > > > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > > > + xa_for_each(&doe_mb->feats, j, entry) > > > + return a->mode; > > > + } > > > + > > > + return 0; > > > > The nested loops that don't test anything look a little weird and > > maybe I'm missing something, but this looks like it returns a->mode if > > any mailbox with a feature exists, and 0 otherwise. > > > > Is that the same as this: > > > > if (pdev->doe_mbs) > > return a->mode; > > > > return 0; > > > > since it sounds like a mailbox must support at least one feature? > > In theory it's the same, in practice there *might* be non-compliant > devices which lack support for the discovery feature. Is there any point in setting ->doe_mbs if there's no feature? > > > + attrs[i].attr.name = kasprintf(GFP_KERNEL, > > > + "0x%04lX:%02lX", vid, type); > > > > What's the rationale for using "0x" on the vendor ID but not on the > > type? "0x1234:10" hints that the "10" might be decimal since it lacks > > "0x". This is my main question. Seems like it should be both or neither. > > I try hard to avoid calling *anything* from the > > pci_create_sysfs_dev_files() path because it has the nasty > > "sysfs_initialized" check and the associated pci_sysfs_init() > > initcall. > > What's the purpose of sysfs_initialized anyway? > > It was introduced by this historic commit: > https://git.kernel.org/tglx/history/c/f6d553444da2 > > Can PCI_ROM_RESOURCEs appear after device enumeration but before > the late_initcall stage? > > If sysfs_initialized is only needed for PCI_ROM_RESOURCEs, can we > constrain pci_sysfs_init() to those and avoid creating all the > other runtime sysfs attributes in the initcall? I think pci_sysfs_init() is already constrained to only the BARs and ROM. Constraining it to only the ROM would be an improvement, but I'd really like to get rid of it altogether. Krzysztof W. moved a lot of stuff out of pci_sysfs_init() a while ago, but the BARs are harder because of some arch/alpha wrinkles, IIRC. I think the reason for pci_sysfs_init() exists in the first place is because those resources may be assigned after pci_device_add(), and (my memory is hazy here) it seems like changing the size of binary attributes is hard, which might fit with the pci_remove_resource_files() and pci_create_resource_files() in the resource##n##_resize_store() macro: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/pci/pci-sysfs.c?id=v6.5#n1440 Bjorn
On Tue, Oct 17, 2023 at 6:34 PM Lukas Wunner <lukas@wunner.de> wrote: > > On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote: > > +#ifdef CONFIG_SYSFS > > +static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj, > > + struct attribute *a, int n) > > +{ > > + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); > > + struct pci_doe_mb *doe_mb; > > + unsigned long index, j; > > + void *entry; > > + > > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > > + xa_for_each(&doe_mb->feats, j, entry) > > + return a->mode; > > + } > > + > > + return 0; > > +} > > Out of curiosity: Does this method prevent creation of a > "doe_features" directory for devices which don't have any > DOE mailboxes? It does once this patch (or something similar) is applied: https://lkml.org/lkml/2022/8/24/607 GKH and I are working on getting a patch like that working and merged Alistair > > (If it does, a code comment explaining that might be helpful.) > > > > +const struct attribute_group pci_dev_doe_feature_group = { > > + .name = "doe_features", > > + .attrs = pci_dev_doe_feature_attrs, > > + .is_visible = pci_doe_sysfs_attr_is_visible, > > +}; > > Nit: Wondering why the "=" is aligned for .name and .attrs > but not for .is_visible? > > > > +static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev, > > + struct pci_doe_mb *doe_mb) > > +{ > > + struct device *dev = &pdev->dev; > > + struct device_attribute *attrs = doe_mb->sysfs_attrs; > > + unsigned long i; > > + void *entry; > > Nit: Inverse Christmas tree? > > > > +static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev, > > + struct pci_doe_mb *doe_mb) > > +{ > > + struct device *dev = &pdev->dev; > > + struct device_attribute *attrs; > > + unsigned long num_features = 0; > > + unsigned long vid, type; > > + unsigned long i; > > + void *entry; > > + int ret; > > + > > + xa_for_each(&doe_mb->feats, i, entry) > > + num_features++; > > + > > + attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL); > > + if (!attrs) > > + return -ENOMEM; > > + > > + doe_mb->sysfs_attrs = attrs; > > + xa_for_each(&doe_mb->feats, i, entry) { > > + sysfs_attr_init(&attrs[i].attr); > > + vid = xa_to_value(entry) >> 8; > > + type = xa_to_value(entry) & 0xFF; > > + attrs[i].attr.name = kasprintf(GFP_KERNEL, > > + "0x%04lX:%02lX", vid, type); > > Nit: Capital X conversion specifier will result in upper case hex > characters in filename and contents, whereas existing sysfs attributes > such as "vendor", "device" contain lower case hex characters. > > Might want to consider lower case x for consistency. > > > > +void pci_doe_sysfs_teardown(struct pci_dev *pdev) > > +{ > > + struct pci_doe_mb *doe_mb; > > + unsigned long index; > > + > > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > > + pci_doe_sysfs_feature_remove(pdev, doe_mb); > > + } > > Nit: Curly braces not necessary. > > > > @@ -1153,6 +1154,10 @@ static void pci_remove_resource_files(struct pci_dev *pdev) > > { > > int i; > > > > + if (IS_ENABLED(CONFIG_PCI_DOE)) { > > + pci_doe_sysfs_teardown(pdev); > > + } > > Nit: Curly braces not necessary. > > > > @@ -1230,6 +1235,12 @@ static int pci_create_resource_files(struct pci_dev *pdev) > > int i; > > int retval; > > > > + if (IS_ENABLED(CONFIG_PCI_DOE)) { > > + retval = pci_doe_sysfs_init(pdev); > > + if (retval) > > + return retval; > > + } > > + > > /* Expose the PCI resources from this device as files */ > > for (i = 0; i < PCI_STD_NUM_BARS; i++) { > > I think this needs to be added to pci_create_sysfs_dev_files() instead. > > pci_create_resource_files() only deals with creation of resource files, > as the name implies, which is unrelated to DOE features. > > Worse, pci_create_resource_files() is also called from > pci_dev_resource_resize_attr(), i.e. every time user space > writes to the "resource##n##_resize" attributes. > > Similarly, the call to pci_doe_sysfs_teardown() belongs in > pci_remove_sysfs_dev_files(). > > Thanks, > > Lukas
On Fri, Oct 20, 2023 at 2:58 AM Bjorn Helgaas <helgaas@kernel.org> wrote: > > On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote: > > The PCIe 6 specification added support for the Data Object Exchange (DOE). > > When DOE is supported the DOE Discovery Feature must be implemented per > > PCIe r6.1 sec 6.30.1.1. The protocol allows a requester to obtain > > information about the other DOE features supported by the device. > > ... > > > +static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj, > > + struct attribute *a, int n) > > +{ > > + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); > > + struct pci_doe_mb *doe_mb; > > + unsigned long index, j; > > + void *entry; > > + > > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > > + xa_for_each(&doe_mb->feats, j, entry) > > + return a->mode; > > + } > > + > > + return 0; > > The nested loops that don't test anything look a little weird and > maybe I'm missing something, but this looks like it returns a->mode if > any mailbox with a feature exists, and 0 otherwise. > > Is that the same as this: > > if (pdev->doe_mbs) > return a->mode; > > return 0; > > since it sounds like a mailbox must support at least one feature? I don't think this is the exact same. pdev->doe_mbs exist (created by xa_init()) even if there are no features supported. I do think it's important we make sure DOE features exist before we show the property. > > > +} > > + > > +static struct attribute *pci_dev_doe_feature_attrs[] = { > > + NULL, > > +}; > > + > > +const struct attribute_group pci_dev_doe_feature_group = { > > + .name = "doe_features", > > + .attrs = pci_dev_doe_feature_attrs, > > + .is_visible = pci_doe_sysfs_attr_is_visible, > > +}; > > + > > +static ssize_t pci_doe_sysfs_feature_show(struct device *dev, > > + struct device_attribute *attr, > > + char *buf) > > +{ > > + return sysfs_emit(buf, "%s\n", attr->attr.name); > > +} > > + > > +static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev, > > + struct pci_doe_mb *doe_mb) > > +{ > > + struct device *dev = &pdev->dev; > > + struct device_attribute *attrs = doe_mb->sysfs_attrs; > > + unsigned long i; > > + void *entry; > > + > > + if (!attrs) > > + return; > > + > > + doe_mb->sysfs_attrs = NULL; > > + xa_for_each(&doe_mb->feats, i, entry) { > > + if (attrs[i].show) > > + sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, > > + pci_dev_doe_feature_group.name); > > + kfree(attrs[i].attr.name); > > + } > > + kfree(attrs); > > +} > > + > > +static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev, > > + struct pci_doe_mb *doe_mb) > > +{ > > + struct device *dev = &pdev->dev; > > + struct device_attribute *attrs; > > + unsigned long num_features = 0; > > + unsigned long vid, type; > > + unsigned long i; > > + void *entry; > > + int ret; > > + > > + xa_for_each(&doe_mb->feats, i, entry) > > + num_features++; > > + > > + attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL); > > + if (!attrs) > > + return -ENOMEM; > > + > > + doe_mb->sysfs_attrs = attrs; > > + xa_for_each(&doe_mb->feats, i, entry) { > > + sysfs_attr_init(&attrs[i].attr); > > + vid = xa_to_value(entry) >> 8; > > + type = xa_to_value(entry) & 0xFF; > > + attrs[i].attr.name = kasprintf(GFP_KERNEL, > > + "0x%04lX:%02lX", vid, type); > > What's the rationale for using "0x" on the vendor ID but not on the > type? "0x1234:10" hints that the "10" might be decimal since it lacks > "0x". > > Suggest lower-case "%04lx:%02lx" either way. Fixed! > > FWIW, there's no "0x" prefix on the hex vendor IDs in "lspci -n" > output and dmesg messages like this: > > pci 0000:01:00.0: [10de:13b6] type 00 > > > + if (!attrs[i].attr.name) { > > + ret = -ENOMEM; > > + goto fail; > > + } > > + > > + attrs[i].attr.mode = 0444; > > + attrs[i].show = pci_doe_sysfs_feature_show; > > + > > + ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, > > + pci_dev_doe_feature_group.name); > > + if (ret) { > > + attrs[i].show = NULL; > > + goto fail; > > + } > > + } > > + > > + return 0; > > + > > +fail: > > + pci_doe_sysfs_feature_remove(pdev, doe_mb); > > + return ret; > > Not sure we should treat this failure this seriously. Looks like it > will prevent creation of the BAR resource files, and possibly even > abort pci_sysfs_init() early. I think the pci_dev will still be > usable (lacking DOE sysfs) even if this fails. I can change the call in pci_create_resource_files() to not return? > > > +} > > + > > +void pci_doe_sysfs_teardown(struct pci_dev *pdev) > > +{ > > + struct pci_doe_mb *doe_mb; > > + unsigned long index; > > + > > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > > + pci_doe_sysfs_feature_remove(pdev, doe_mb); > > + } > > +} > > + > > +int pci_doe_sysfs_init(struct pci_dev *pdev) > > +{ > > + struct pci_doe_mb *doe_mb; > > + unsigned long index; > > + int ret; > > + > > + xa_for_each(&pdev->doe_mbs, index, doe_mb) { > > + ret = pci_doe_sysfs_feature_populate(pdev, doe_mb); > > + if (ret) > > + return ret; > > + } > > I agree with Lukas that pci_create_resource_files() is not the right > place to call this. > > I try hard to avoid calling *anything* from the > pci_create_sysfs_dev_files() path because it has the nasty > "sysfs_initialized" check and the associated pci_sysfs_init() > initcall. > > It's so much cleaner when we can set up static attributes that are > automatically added in the device_add() path. I don't know whether > that's possible. I see lots of discussion with Greg KH that might be > related, but I'm not sure. I don't think it's possible, at least not that I or anyone else has been able to figure out yet. > > I do know that we enumerate the mailboxes and features during > pci_init_capabilities(), which happens before device_add(), so the > information about which attributes should be present is at least > *available* early enough: > > pci_host_probe > pci_scan_root_bus_bridge > ... > pci_scan_single_device > pci_device_add > pci_init_capabilities > pci_doe_init > while (pci_find_next_ext_capability(PCI_EXT_CAP_ID_DOE)) > pci_doe_create_mb > pci_doe_cache_features > pci_doe_discovery > xa_insert(&doe_mb->feats) <-- > device_add > device_add_attrs > device_add_groups > pci_bus_add_devices > pci_bus_add_device > pci_create_sysfs_dev_files > ... > pci_doe_sysfs_init <-- > xa_for_each(&pdev->doe_mbs) > pci_doe_sysfs_feature_populate > xa_for_each(&doe_mb->feats) > sysfs_add_file_to_group(pci_dev_doe_feature_group.name) > > Is it feasible to build an attribute group in pci_doe_init() and add > it to dev->groups so device_add() will automatically add them? That doesn't work as the sysfs_add_file_to_group() function will seg fault when trying to find the parent as I don't think it exists yet. [ 0.767581] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 0.767835] #PF: supervisor read access in kernel mode [ 0.767835] #PF: error_code(0x0000) - not-present page [ 0.767835] PGD 0 P4D 0 [ 0.767835] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 0.767835] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.6.0-10270-g5dda351a02c8-dirty #10 [ 0.767835] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-14-g1e1da7a96300-prebuilt.qemu.org 04/01/2014 [ 0.767835] RIP: 0010:kernfs_find_and_get_ns+0x10/0x70 [ 0.767835] Code: 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 41 55 49 89 d5 41 54 49 89 f4 55 53 <48> 8b 0 [ 0.767835] RSP: 0018:ffff96f9c00138a8 EFLAGS: 00000246 [ 0.767835] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000030 [ 0.767835] RDX: 0000000000000000 RSI: ffffffffafec53d9 RDI: 0000000000000000 [ 0.767835] RBP: ffff957b4180e0b8 R08: 0000000000000000 R09: 0000000000ffff10 [ 0.767835] R10: 0000000000000000 R11: ffffffffaf677c80 R12: ffffffffafec53d9 [ 0.767835] R13: 0000000000000000 R14: ffff957b413c1ea0 R15: ffff957b4180e000 [ 0.767835] FS: 0000000000000000(0000) GS:ffff957bbdc00000(0000) knlGS:0000000000000000 [ 0.767835] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.767835] CR2: 0000000000000008 CR3: 000000004442e000 CR4: 00000000000006f0 [ 0.767835] Call Trace: [ 0.767835] <TASK> [ 0.767835] ? __die+0x1e/0x60 [ 0.767835] ? page_fault_oops+0x17c/0x470 [ 0.767835] ? search_module_extables+0x14/0x50 [ 0.767835] ? exc_page_fault+0x67/0x150 [ 0.767835] ? asm_exc_page_fault+0x26/0x30 [ 0.767835] ? __pfx_pci_mmcfg_read+0x10/0x10 [ 0.767835] ? kernfs_find_and_get_ns+0x10/0x70 [ 0.767835] ? kasprintf+0x5a/0x80 [ 0.767835] sysfs_add_file_to_group+0x4c/0x110 [ 0.767835] pci_doe_sysfs_init+0x13b/0x240 [ 0.767835] pci_device_add+0x1d7/0x620 [ 0.767835] pci_scan_single_device+0xc8/0x100 [ 0.767835] pci_scan_slot+0x6f/0x1e0 [ 0.767835] pci_scan_child_bus_extend+0x30/0x210 [ 0.767835] pci_scan_bridge_extend+0x5f4/0x710 [ 0.767835] pci_scan_child_bus_extend+0xc2/0x210 [ 0.767835] acpi_pci_root_create+0x283/0x2f0 [ 0.767835] pci_acpi_scan_root+0x199/0x200 [ 0.767835] acpi_pci_root_add+0x1ba/0x370 [ 0.767835] acpi_bus_attach+0x140/0x260 [ 0.767835] ? __pfx_acpi_dev_for_one_check+0x10/0x10 [ 0.767835] device_for_each_child+0x68/0xa0 [ 0.767835] acpi_dev_for_each_child+0x37/0x60 [ 0.767835] ? __pfx_acpi_bus_attach+0x10/0x10 [ 0.767835] acpi_bus_attach+0x21e/0x260 [ 0.767835] ? __pfx_acpi_dev_for_one_check+0x10/0x10 [ 0.767835] device_for_each_child+0x68/0xa0 [ 0.767835] acpi_dev_for_each_child+0x37/0x60 [ 0.767835] ? __pfx_acpi_bus_attach+0x10/0x10 [ 0.767835] acpi_bus_attach+0x21e/0x260 [ 0.767835] acpi_bus_scan+0x6b/0x1e0 [ 0.767835] acpi_scan_init+0xdc/0x290 [ 0.767835] acpi_init+0x22b/0x500 [ 0.767835] ? __pfx_acpi_init+0x10/0x10 [ 0.767835] do_one_initcall+0x56/0x220 [ 0.767835] kernel_init_freeable+0x19e/0x2d0 [ 0.767835] ? __pfx_kernel_init+0x10/0x10 [ 0.767835] kernel_init+0x15/0x1b0 [ 0.767835] ret_from_fork+0x2f/0x50 [ 0.767835] ? __pfx_kernel_init+0x10/0x10 [ 0.767835] ret_from_fork_asm+0x1b/0x30 I can move this to pci_create_sysfs_dev_files() instead if that's at least better? > > It looks like __power_supply_register() does something like that: > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/power/supply/power_supply_core.c?id=v6.5#n1375 > > > --- a/include/linux/pci-doe.h > > +++ b/include/linux/pci-doe.h > > @@ -22,4 +22,6 @@ int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type, > > const void *request, size_t request_sz, > > void *response, size_t response_sz); > > > > +int pci_doe_sysfs_init(struct pci_dev *pci_dev); > > +void pci_doe_sysfs_teardown(struct pci_dev *pdev); > > These declarations look like they should be in drivers/pci/pci.h as > well. I don't think anything outside the PCI core should need these. I will move these. Alistair > > Bjorn
On Thu, Oct 19, 2023 at 11:58:29AM -0500, Bjorn Helgaas wrote: > Is it feasible to build an attribute group in pci_doe_init() and add > it to dev->groups so device_add() will automatically add them? Note that pcibios_device_add() in arch/s390/pci/pci.c does this: pdev->dev.groups = zpci_attr_groups; ... which prevents usage of pdev->dev.groups for anything else. This needs to be cleaned up first before the PCI core can allocate and fill generic attribute groups on enumeration. Thanks, Lukas
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index ecf47559f495..66d9d66e3584 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -500,3 +500,27 @@ Description: console drivers from the device. Raw users of pci-sysfs resourceN attributes must be terminated prior to resizing. Success of the resizing operation is not guaranteed. + +What: /sys/bus/pci/devices/.../doe_features +Date: October 2023 +Contact: Linux PCI developers <linux-pci@vger.kernel.org> +Description: + This directory contains a list of the supported + Data Object Exchange (DOE) features. The feature values are in + the file name. The contents of each file are the same as the + name. + + The value comes from the device and specifies the vendor and + data object type supported. The lower (RHS of the colon) is + the data object type in hex. The upper (LHS of the colon) + is the vendor ID. + + As all DOE devices must support the DOE discovery protocol, if + DOE is supported you will at least see this file, with this + contents + + # cat doe_features/0x0001:00 + 0x0001:00 + + If the device supports other protocols you will see other files + as well. diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c index db2a86edf2e1..1ffc5441e6d1 100644 --- a/drivers/pci/doe.c +++ b/drivers/pci/doe.c @@ -47,6 +47,7 @@ * @wq: Wait queue for work item * @work_queue: Queue of pci_doe_work items * @flags: Bit array of PCI_DOE_FLAG_* flags + * @sysfs_attrs: Array of sysfs device attributes */ struct pci_doe_mb { struct pci_dev *pdev; @@ -56,6 +57,10 @@ struct pci_doe_mb { wait_queue_head_t wq; struct workqueue_struct *work_queue; unsigned long flags; + +#ifdef CONFIG_SYSFS + struct device_attribute *sysfs_attrs; +#endif }; struct pci_doe_feature { @@ -92,6 +97,135 @@ struct pci_doe_task { struct pci_doe_mb *doe_mb; }; +#ifdef CONFIG_SYSFS +static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + struct pci_doe_mb *doe_mb; + unsigned long index, j; + void *entry; + + xa_for_each(&pdev->doe_mbs, index, doe_mb) { + xa_for_each(&doe_mb->feats, j, entry) + return a->mode; + } + + return 0; +} + +static struct attribute *pci_dev_doe_feature_attrs[] = { + NULL, +}; + +const struct attribute_group pci_dev_doe_feature_group = { + .name = "doe_features", + .attrs = pci_dev_doe_feature_attrs, + .is_visible = pci_doe_sysfs_attr_is_visible, +}; + +static ssize_t pci_doe_sysfs_feature_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%s\n", attr->attr.name); +} + +static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev, + struct pci_doe_mb *doe_mb) +{ + struct device *dev = &pdev->dev; + struct device_attribute *attrs = doe_mb->sysfs_attrs; + unsigned long i; + void *entry; + + if (!attrs) + return; + + doe_mb->sysfs_attrs = NULL; + xa_for_each(&doe_mb->feats, i, entry) { + if (attrs[i].show) + sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, + pci_dev_doe_feature_group.name); + kfree(attrs[i].attr.name); + } + kfree(attrs); +} + +static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev, + struct pci_doe_mb *doe_mb) +{ + struct device *dev = &pdev->dev; + struct device_attribute *attrs; + unsigned long num_features = 0; + unsigned long vid, type; + unsigned long i; + void *entry; + int ret; + + xa_for_each(&doe_mb->feats, i, entry) + num_features++; + + attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return -ENOMEM; + + doe_mb->sysfs_attrs = attrs; + xa_for_each(&doe_mb->feats, i, entry) { + sysfs_attr_init(&attrs[i].attr); + vid = xa_to_value(entry) >> 8; + type = xa_to_value(entry) & 0xFF; + attrs[i].attr.name = kasprintf(GFP_KERNEL, + "0x%04lX:%02lX", vid, type); + if (!attrs[i].attr.name) { + ret = -ENOMEM; + goto fail; + } + + attrs[i].attr.mode = 0444; + attrs[i].show = pci_doe_sysfs_feature_show; + + ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, + pci_dev_doe_feature_group.name); + if (ret) { + attrs[i].show = NULL; + goto fail; + } + } + + return 0; + +fail: + pci_doe_sysfs_feature_remove(pdev, doe_mb); + return ret; +} + +void pci_doe_sysfs_teardown(struct pci_dev *pdev) +{ + struct pci_doe_mb *doe_mb; + unsigned long index; + + xa_for_each(&pdev->doe_mbs, index, doe_mb) { + pci_doe_sysfs_feature_remove(pdev, doe_mb); + } +} + +int pci_doe_sysfs_init(struct pci_dev *pdev) +{ + struct pci_doe_mb *doe_mb; + unsigned long index; + int ret; + + xa_for_each(&pdev->doe_mbs, index, doe_mb) { + ret = pci_doe_sysfs_feature_populate(pdev, doe_mb); + if (ret) + return ret; + } + + return 0; +} +#endif + static int pci_doe_wait(struct pci_doe_mb *doe_mb, unsigned long timeout) { if (wait_event_timeout(doe_mb->wq, diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index d9eede2dbc0e..64bf765df5e2 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -16,6 +16,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/pci.h> +#include <linux/pci-doe.h> #include <linux/stat.h> #include <linux/export.h> #include <linux/topology.h> @@ -1153,6 +1154,10 @@ static void pci_remove_resource_files(struct pci_dev *pdev) { int i; + if (IS_ENABLED(CONFIG_PCI_DOE)) { + pci_doe_sysfs_teardown(pdev); + } + for (i = 0; i < PCI_STD_NUM_BARS; i++) { struct bin_attribute *res_attr; @@ -1230,6 +1235,12 @@ static int pci_create_resource_files(struct pci_dev *pdev) int i; int retval; + if (IS_ENABLED(CONFIG_PCI_DOE)) { + retval = pci_doe_sysfs_init(pdev); + if (retval) + return retval; + } + /* Expose the PCI resources from this device as files */ for (i = 0; i < PCI_STD_NUM_BARS; i++) { @@ -1655,6 +1666,9 @@ static const struct attribute_group *pci_dev_attr_groups[] = { #endif #ifdef CONFIG_PCIEASPM &aspm_ctrl_attr_group, +#endif +#ifdef CONFIG_PCI_DOE + &pci_dev_doe_feature_group, #endif NULL, }; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 39a8932dc340..83065e07c491 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -186,6 +186,7 @@ extern const struct attribute_group *pci_dev_groups[]; extern const struct attribute_group *pcibus_groups[]; extern const struct device_type pci_dev_type; extern const struct attribute_group *pci_bus_groups[]; +extern const struct attribute_group pci_dev_doe_feature_group; extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mmio_size; diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h index 1f14aed4354b..c5529ae604ff 100644 --- a/include/linux/pci-doe.h +++ b/include/linux/pci-doe.h @@ -22,4 +22,6 @@ int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type, const void *request, size_t request_sz, void *response, size_t response_sz); +int pci_doe_sysfs_init(struct pci_dev *pci_dev); +void pci_doe_sysfs_teardown(struct pci_dev *pdev); #endif