Message ID | 20250106121017.1620-2-shiju.jose@huawei.com (mailing list archive) |
---|---|
State | Handled Elsewhere, archived |
Headers | show |
Series | EDAC: Scrub: introduce generic EDAC RAS control feature driver + CXL/ACPI-RAS2 drivers | expand |
On Mon, Jan 06, 2025 at 12:09:57PM +0000, shiju.jose@huawei.com wrote: > +int edac_dev_register(struct device *parent, char *name, > + void *private, int num_features, > + const struct edac_dev_feature *ras_features) > +{ > + const struct attribute_group **ras_attr_groups; > + struct edac_dev_feat_ctx *ctx; > + int attr_gcnt = 0; > + int ret, feat; > + > + if (!parent || !name || !num_features || !ras_features) > + return -EINVAL; > + > + /* Double parse to make space for attributes */ > + for (feat = 0; feat < num_features; feat++) { > + switch (ras_features[feat].ft_type) { > + /* Add feature specific code */ > + default: > + return -EINVAL; > + } > + } > + > + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); > + if (!ctx) > + return -ENOMEM; > + > + ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL); > + if (!ras_attr_groups) { > + ret = -ENOMEM; > + goto ctx_free; > + } > + > + attr_gcnt = 0; > + for (feat = 0; feat < num_features; feat++, ras_features++) { > + switch (ras_features->ft_type) { > + /* Add feature specific code */ > + default: > + ret = -EINVAL; > + goto groups_free; > + } > + } > + > + ctx->dev.parent = parent; > + ctx->dev.bus = edac_get_sysfs_subsys(); > + ctx->dev.type = &edac_dev_type; > + ctx->dev.groups = ras_attr_groups; > + ctx->private = private; > + dev_set_drvdata(&ctx->dev, ctx); > + > + ret = dev_set_name(&ctx->dev, name); > + if (ret) > + goto groups_free; > + > + ret = device_register(&ctx->dev); > + if (ret) { > + put_device(&ctx->dev); > + return ret; Who is freeing ctx and ras_attr_groups when you return here?
>-----Original Message----- >From: Borislav Petkov <bp@alien8.de> >Sent: 06 January 2025 13:38 >To: Shiju Jose <shiju.jose@huawei.com> >Cc: linux-edac@vger.kernel.org; linux-cxl@vger.kernel.org; linux- >acpi@vger.kernel.org; linux-mm@kvack.org; linux-kernel@vger.kernel.org; >tony.luck@intel.com; rafael@kernel.org; lenb@kernel.org; >mchehab@kernel.org; dan.j.williams@intel.com; dave@stgolabs.net; Jonathan >Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com; >alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com; >david@redhat.com; Vilas.Sridharan@amd.com; leo.duran@amd.com; >Yazen.Ghannam@amd.com; rientjes@google.com; jiaqiyan@google.com; >Jon.Grimm@amd.com; dave.hansen@linux.intel.com; >naoya.horiguchi@nec.com; james.morse@arm.com; jthoughton@google.com; >somasundaram.a@hpe.com; erdemaktas@google.com; pgonda@google.com; >duenwen@google.com; gthelen@google.com; >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; Roberto >Sassu <roberto.sassu@huawei.com>; kangkang.shen@futurewei.com; >wanghuiqiang <wanghuiqiang@huawei.com>; Linuxarm ><linuxarm@huawei.com> >Subject: Re: [PATCH v18 01/19] EDAC: Add support for EDAC device features >control > >On Mon, Jan 06, 2025 at 12:09:57PM +0000, shiju.jose@huawei.com wrote: >> +int edac_dev_register(struct device *parent, char *name, >> + void *private, int num_features, >> + const struct edac_dev_feature *ras_features) { >> + const struct attribute_group **ras_attr_groups; >> + struct edac_dev_feat_ctx *ctx; >> + int attr_gcnt = 0; >> + int ret, feat; >> + >> + if (!parent || !name || !num_features || !ras_features) >> + return -EINVAL; >> + >> + /* Double parse to make space for attributes */ >> + for (feat = 0; feat < num_features; feat++) { >> + switch (ras_features[feat].ft_type) { >> + /* Add feature specific code */ >> + default: >> + return -EINVAL; >> + } >> + } >> + >> + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); >> + if (!ctx) >> + return -ENOMEM; >> + >> + ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), >GFP_KERNEL); >> + if (!ras_attr_groups) { >> + ret = -ENOMEM; >> + goto ctx_free; >> + } >> + >> + attr_gcnt = 0; >> + for (feat = 0; feat < num_features; feat++, ras_features++) { >> + switch (ras_features->ft_type) { >> + /* Add feature specific code */ >> + default: >> + ret = -EINVAL; >> + goto groups_free; >> + } >> + } >> + >> + ctx->dev.parent = parent; >> + ctx->dev.bus = edac_get_sysfs_subsys(); >> + ctx->dev.type = &edac_dev_type; >> + ctx->dev.groups = ras_attr_groups; >> + ctx->private = private; >> + dev_set_drvdata(&ctx->dev, ctx); >> + >> + ret = dev_set_name(&ctx->dev, name); >> + if (ret) >> + goto groups_free; >> + >> + ret = device_register(&ctx->dev); >> + if (ret) { >> + put_device(&ctx->dev); >> + return ret; > >Who is freeing ctx and ras_attr_groups when you return here? Hi Boris, ctx and ras_attr_groups are freed in the callback function for release edac_dev_release(struct device *dev). Thanks, Shiju > >-- >Regards/Gruss, > Boris. > >https://people.kernel.org/tglx/notes-about-netiquette
Em Mon, 6 Jan 2025 12:09:57 +0000 <shiju.jose@huawei.com> escreveu: > From: Shiju Jose <shiju.jose@huawei.com> > > Add generic EDAC device feature controls supporting the registration > of RAS features available in the system. The driver exposes control > attributes for these features to userspace in > /sys/bus/edac/devices/<dev-name>/<ras-feature>/ > > Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Signed-off-by: Shiju Jose <shiju.jose@huawei.com> > --- > Documentation/edac/features.rst | 94 ++++++++++++++++++++++++++++++ > Documentation/edac/index.rst | 10 ++++ > drivers/edac/edac_device.c | 100 ++++++++++++++++++++++++++++++++ > include/linux/edac.h | 28 +++++++++ > 4 files changed, 232 insertions(+) > create mode 100644 Documentation/edac/features.rst > create mode 100644 Documentation/edac/index.rst > > diff --git a/Documentation/edac/features.rst b/Documentation/edac/features.rst > new file mode 100644 > index 000000000000..f32f259ce04d > --- /dev/null > +++ b/Documentation/edac/features.rst > @@ -0,0 +1,94 @@ > +.. SPDX-License-Identifier: GPL-2.0 SPDX should match what's written there, e. g. .. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later Please notice that GNU FDL family contains both open source and non-open source licenses. The open-source one is this: https://spdx.org/licenses/GFDL-1.2-no-invariants-or-later.html E.g. it is a the license permits changing the entire document in the future, as there's no invariant parts on it. > + > +============================================ > +Augmenting EDAC for controlling RAS features > +============================================ > + > +Copyright (c) 2024 HiSilicon Limited. 2024-2025? > + > +:Author: Shiju Jose <shiju.jose@huawei.com> > +:License: The GNU Free Documentation License, Version 1.2 > + (dual licensed under the GPL v2) You need to define if invariant parts are allowed or not, e. g.: :License: The GNU Free Documentation License, Version 1.2 without Invariant Sections, Front-Cover Texts nor Back-Cover Texts. (dual licensed under the GPL v2) > +:Original Reviewers: > + > +- Written for: 6.14 > + > +Introduction > +------------ > +The expansion of EDAC for controlling RAS features and exposing features > +control attributes to userspace via sysfs. Some Examples: > + > +* Scrub control > + > +* Error Check Scrub (ECS) control > + > +* ACPI RAS2 features > + > +* Post Package Repair (PPR) control > + > +* Memory Sparing Repair control etc. > + > +High level design is illustrated in the following diagram:: > + > + _______________________________________________ > + | Userspace - Rasdaemon | > + | _____________ | > + | | RAS CXL mem | _______________ | > + | |error handler|---->| | | > + | |_____________| | RAS dynamic | | > + | _____________ | scrub, memory | | > + | | RAS memory |---->| repair control| | > + | |error handler| |_______________| | > + | |_____________| | | > + |__________________________|____________________| > + | > + | > + _______________________________|______________________________ > + | Kernel EDAC extension for | controlling RAS Features | > + | ______________________________|____________________________ | > + || EDAC Core Sysfs EDAC| Bus | | > + || __________________________|_________ _____________ | | > + || |/sys/bus/edac/devices/<dev>/scrubX/ | | EDAC device || | > + || |/sys/bus/edac/devices/<dev>/ecsX/ |<->| EDAC MC || | > + || |/sys/bus/edac/devices/<dev>/repairX | | EDAC sysfs || | > + || |____________________________________| |_____________|| | > + || EDAC|Bus | | > + || | | | > + || __________ Get feature | Get feature | | > + || | |desc _________|______ desc __________ | | > + || |EDAC scrub|<-----| EDAC device | | | | | > + || |__________| | driver- RAS |---->| EDAC mem | | | > + || __________ | feature control| | repair | | | > + || | |<-----|________________| |__________| | | > + || |EDAC ECS | Register RAS|features | | > + || |__________| | | | > + || ______________________|_____________ | | > + ||_________|_______________|__________________|______________| | > + | _______|____ _______|_______ ____|__________ | > + | | | | CXL mem driver| | Client driver | | > + | | ACPI RAS2 | | scrub, ECS, | | memory repair | | > + | | driver | | sparing, PPR | | features | | > + | |____________| |_______________| |_______________| | > + | | | | | > + |________|_________________|____________________|______________| > + | | | > + ________|_________________|____________________|______________ > + | ___|_________________|____________________|_______ | > + | | | | > + | | Platform HW and Firmware | | > + | |__________________________________________________| | > + |______________________________________________________________| > + > + > +1. EDAC Features components - Create feature specific descriptors. > +For example, EDAC scrub, EDAC ECS, EDAC memory repair in the above > +diagram. > + > +2. EDAC device driver for controlling RAS Features - Get feature's attribute > +descriptors from EDAC RAS feature component and registers device's RAS > +features with EDAC bus and exposes the features control attributes via > +the sysfs EDAC bus. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/ > + > +3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for > +dynamic scrub/repair control to issue scrubbing/repair when excess number > +of corrected memory errors are reported in a short span of time. > diff --git a/Documentation/edac/index.rst b/Documentation/edac/index.rst > new file mode 100644 > index 000000000000..b6c265a4cffb > --- /dev/null > +++ b/Documentation/edac/index.rst > @@ -0,0 +1,10 @@ > +.. SPDX-License-Identifier: GPL-2.0 > + > +============== > +EDAC Subsystem > +============== > + > +.. toctree:: > + :maxdepth: 1 > + > + features > diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c > index 621dc2a5d034..9fce46dd7405 100644 > --- a/drivers/edac/edac_device.c > +++ b/drivers/edac/edac_device.c > @@ -570,3 +570,103 @@ void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev, > block ? block->name : "N/A", count, msg); > } > EXPORT_SYMBOL_GPL(edac_device_handle_ue_count); > + > +static void edac_dev_release(struct device *dev) > +{ > + struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev); > + > + kfree(ctx->dev.groups); > + kfree(ctx); > +} > + > +const struct device_type edac_dev_type = { > + .name = "edac_dev", > + .release = edac_dev_release, > +}; > + > +static void edac_dev_unreg(void *data) > +{ > + device_unregister(data); > +} > + > +/** > + * edac_dev_register - register device for RAS features with EDAC > + * @parent: parent device. > + * @name: parent device's name. > + * @private: parent driver's data to store in the context if any. > + * @num_features: number of RAS features to register. > + * @ras_features: list of RAS features to register. > + * > + * Return: > + * * %0 - Success. > + * * %-EINVAL - Invalid parameters passed. > + * * %-ENOMEM - Dynamic memory allocation failed. > + * > + */ > +int edac_dev_register(struct device *parent, char *name, > + void *private, int num_features, > + const struct edac_dev_feature *ras_features) > +{ > + const struct attribute_group **ras_attr_groups; > + struct edac_dev_feat_ctx *ctx; > + int attr_gcnt = 0; > + int ret, feat; > + > + if (!parent || !name || !num_features || !ras_features) > + return -EINVAL; > + > + /* Double parse to make space for attributes */ > + for (feat = 0; feat < num_features; feat++) { > + switch (ras_features[feat].ft_type) { > + /* Add feature specific code */ > + default: > + return -EINVAL; > + } > + } > + > + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); > + if (!ctx) > + return -ENOMEM; > + > + ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL); > + if (!ras_attr_groups) { > + ret = -ENOMEM; > + goto ctx_free; > + } > + > + attr_gcnt = 0; > + for (feat = 0; feat < num_features; feat++, ras_features++) { > + switch (ras_features->ft_type) { > + /* Add feature specific code */ > + default: > + ret = -EINVAL; > + goto groups_free; > + } > + } > + > + ctx->dev.parent = parent; > + ctx->dev.bus = edac_get_sysfs_subsys(); > + ctx->dev.type = &edac_dev_type; > + ctx->dev.groups = ras_attr_groups; > + ctx->private = private; > + dev_set_drvdata(&ctx->dev, ctx); > + > + ret = dev_set_name(&ctx->dev, name); > + if (ret) > + goto groups_free; > + > + ret = device_register(&ctx->dev); > + if (ret) { > + put_device(&ctx->dev); > + return ret; As register failed, you need to change it to a goto groups_free, as edac_dev_release() won't be called. > + } > + > + return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev); > + > +groups_free: > + kfree(ras_attr_groups); > +ctx_free: > + kfree(ctx); > + return ret; > +} > +EXPORT_SYMBOL_GPL(edac_dev_register); > diff --git a/include/linux/edac.h b/include/linux/edac.h > index b4ee8961e623..521b17113d4d 100644 > --- a/include/linux/edac.h > +++ b/include/linux/edac.h > @@ -661,4 +661,32 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, > > return mci->dimms[index]; > } > + > +#define EDAC_FEAT_NAME_LEN 128 This macro was not used on this patch. > + > +/* RAS feature type */ > +enum edac_dev_feat { > + RAS_FEAT_MAX > +}; > + > +/* EDAC device feature information structure */ > +struct edac_dev_data { > + u8 instance; > + void *private; > +}; > + > +struct edac_dev_feat_ctx { > + struct device dev; > + void *private; > +}; > + > +struct edac_dev_feature { > + enum edac_dev_feat ft_type; > + u8 instance; > + void *ctx; > +}; > + > +int edac_dev_register(struct device *parent, char *dev_name, > + void *parent_pvt_data, int num_features, > + const struct edac_dev_feature *ras_features); > #endif /* _LINUX_EDAC_H_ */ Thanks, Mauro
> > +int edac_dev_register(struct device *parent, char *name, > > + void *private, int num_features, > > + const struct edac_dev_feature *ras_features) > > +{ > > + const struct attribute_group **ras_attr_groups; > > + struct edac_dev_feat_ctx *ctx; > > + int attr_gcnt = 0; > > + int ret, feat; > > + > > + if (!parent || !name || !num_features || !ras_features) > > + return -EINVAL; > > + > > + /* Double parse to make space for attributes */ > > + for (feat = 0; feat < num_features; feat++) { > > + switch (ras_features[feat].ft_type) { > > + /* Add feature specific code */ > > + default: > > + return -EINVAL; > > + } > > + } > > + > > + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); > > + if (!ctx) > > + return -ENOMEM; > > + > > + ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL); > > + if (!ras_attr_groups) { > > + ret = -ENOMEM; > > + goto ctx_free; > > + } > > + > > + attr_gcnt = 0; > > + for (feat = 0; feat < num_features; feat++, ras_features++) { > > + switch (ras_features->ft_type) { > > + /* Add feature specific code */ > > + default: > > + ret = -EINVAL; > > + goto groups_free; > > + } > > + } > > + > > + ctx->dev.parent = parent; > > + ctx->dev.bus = edac_get_sysfs_subsys(); > > + ctx->dev.type = &edac_dev_type; > > + ctx->dev.groups = ras_attr_groups; > > + ctx->private = private; > > + dev_set_drvdata(&ctx->dev, ctx); > > + > > + ret = dev_set_name(&ctx->dev, name); > > + if (ret) > > + goto groups_free; > > + > > + ret = device_register(&ctx->dev); > > + if (ret) { > > + put_device(&ctx->dev); > > > + return ret; > > As register failed, you need to change it to a goto groups_free, > as edac_dev_release() won't be called. Boris called this one out as well, so seems it is not that well understood. I've also tripped over this in the past and it's one of the most common things I catch in reviews of code calling this stuff. As discussed offline, it will be called. The device_register() docs make it clear that whether or not that call succeeds reference counting is enabled and put_device() is the correct way to free resources. The actual depends on the fact that device_register() is just a helper defined as device_initialize(); return device_add(); So for reasons lost to history (I guess there are cases where other cleanup needs to happen before the release) it does not handle side effects of device_initialize() on an error in device_add(). device_initialize() has called -> kobject_init(&dev->kobj, &device_type); -> kref_init_internal(kobj) + sets ktype (which has the release callback) kref_init_internal() sets the reference counter to 1 Hence when we do a device_put() in the error path, the reference counter drops to 0 and the release from the ktype is called. Here that is edac_dev_release(); If you want to verify replace device_register() with device_initialize() then call put_device(). If we were going back in history, I'd suggest device_register() should be side effect free and call put_device() on error and any driver that needs to handle other stuff before the release should just not use it. I guess that ship long sailed and maybe there are other reasons I've not thought of. I took a quick look and seems to go back into at least the 2.5 era. Jonathan
>-----Original Message----- >From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org> >Sent: 13 January 2025 15:06 >To: Shiju Jose <shiju.jose@huawei.com> >Cc: linux-edac@vger.kernel.org; linux-cxl@vger.kernel.org; linux- >acpi@vger.kernel.org; linux-mm@kvack.org; linux-kernel@vger.kernel.org; >bp@alien8.de; tony.luck@intel.com; rafael@kernel.org; lenb@kernel.org; >mchehab@kernel.org; dan.j.williams@intel.com; dave@stgolabs.net; Jonathan >Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com; >alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com; >david@redhat.com; Vilas.Sridharan@amd.com; leo.duran@amd.com; >Yazen.Ghannam@amd.com; rientjes@google.com; jiaqiyan@google.com; >Jon.Grimm@amd.com; dave.hansen@linux.intel.com; >naoya.horiguchi@nec.com; james.morse@arm.com; jthoughton@google.com; >somasundaram.a@hpe.com; erdemaktas@google.com; pgonda@google.com; >duenwen@google.com; gthelen@google.com; >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; Roberto >Sassu <roberto.sassu@huawei.com>; kangkang.shen@futurewei.com; >wanghuiqiang <wanghuiqiang@huawei.com>; Linuxarm ><linuxarm@huawei.com> >Subject: Re: [PATCH v18 01/19] EDAC: Add support for EDAC device features >control > >Em Mon, 6 Jan 2025 12:09:57 +0000 ><shiju.jose@huawei.com> escreveu: > >> From: Shiju Jose <shiju.jose@huawei.com> >> >> Add generic EDAC device feature controls supporting the registration >> of RAS features available in the system. The driver exposes control >> attributes for these features to userspace in >> /sys/bus/edac/devices/<dev-name>/<ras-feature>/ >> >> Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> >> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> >> Signed-off-by: Shiju Jose <shiju.jose@huawei.com> >> --- >> Documentation/edac/features.rst | 94 ++++++++++++++++++++++++++++++ >> Documentation/edac/index.rst | 10 ++++ >> drivers/edac/edac_device.c | 100 ++++++++++++++++++++++++++++++++ >> include/linux/edac.h | 28 +++++++++ >> 4 files changed, 232 insertions(+) >> create mode 100644 Documentation/edac/features.rst create mode >> 100644 Documentation/edac/index.rst >> >> diff --git a/Documentation/edac/features.rst >> b/Documentation/edac/features.rst new file mode 100644 index >> 000000000000..f32f259ce04d >> --- /dev/null >> +++ b/Documentation/edac/features.rst >> @@ -0,0 +1,94 @@ >> +.. SPDX-License-Identifier: GPL-2.0 > >SPDX should match what's written there, e. g. > > .. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later > >Please notice that GNU FDL family contains both open source and non-open >source licenses. The open-source one is this: > > https://spdx.org/licenses/GFDL-1.2-no-invariants-or-later.html > >E.g. it is a the license permits changing the entire document in the future, as >there's no invariant parts on it. This seems not widely used, have seen this is used in few documents only. > >> + >> +============================================ >> +Augmenting EDAC for controlling RAS features >> +============================================ >> + >> +Copyright (c) 2024 HiSilicon Limited. > >2024-2025? Will do. > >> + >> +:Author: Shiju Jose <shiju.jose@huawei.com> >> +:License: The GNU Free Documentation License, Version 1.2 >> + (dual licensed under the GPL v2) > >You need to define if invariant parts are allowed or not, e. g.: > > :License: The GNU Free Documentation License, Version 1.2 without >Invariant Sections, Front-Cover Texts nor Back-Cover Texts. > (dual licensed under the GPL v2) Same as above. > > >> +:Original Reviewers: >> + >> +- Written for: 6.14 >> + >> +Introduction >> +------------ >> +The expansion of EDAC for controlling RAS features and exposing >> +features control attributes to userspace via sysfs. Some Examples: >> + >> +* Scrub control >> + >> +* Error Check Scrub (ECS) control >> + >> +* ACPI RAS2 features >> + >> +* Post Package Repair (PPR) control >> + >> +* Memory Sparing Repair control etc. >> + >> +High level design is illustrated in the following diagram:: >> + >> + _______________________________________________ >> + | Userspace - Rasdaemon | >> + | _____________ | >> + | | RAS CXL mem | _______________ | >> + | |error handler|---->| | | >> + | |_____________| | RAS dynamic | | >> + | _____________ | scrub, memory | | >> + | | RAS memory |---->| repair control| | >> + | |error handler| |_______________| | >> + | |_____________| | | >> + |__________________________|____________________| >> + | >> + | >> + >_______________________________|______________________________ >> + | Kernel EDAC extension for | controlling RAS Features | >> + | ______________________________|____________________________ >| >> + || EDAC Core Sysfs EDAC| Bus | | >> + || __________________________|_________ _____________ | | >> + || |/sys/bus/edac/devices/<dev>/scrubX/ | | EDAC device || | >> + || |/sys/bus/edac/devices/<dev>/ecsX/ |<->| EDAC MC || | >> + || |/sys/bus/edac/devices/<dev>/repairX | | EDAC sysfs || | >> + || |____________________________________| |_____________|| | >> + || EDAC|Bus | | >> + || | | | >> + || __________ Get feature | Get feature | | >> + || | |desc _________|______ desc __________ | | >> + || |EDAC scrub|<-----| EDAC device | | | | | >> + || |__________| | driver- RAS |---->| EDAC mem | | | >> + || __________ | feature control| | repair | | | >> + || | |<-----|________________| |__________| | | >> + || |EDAC ECS | Register RAS|features | | >> + || |__________| | | | >> + || ______________________|_____________ | | >> + ||_________|_______________|__________________|______________| >| >> + | _______|____ _______|_______ ____|__________ | >> + | | | | CXL mem driver| | Client driver | | >> + | | ACPI RAS2 | | scrub, ECS, | | memory repair | | >> + | | driver | | sparing, PPR | | features | | >> + | |____________| |_______________| |_______________| | >> + | | | | | >> + >|________|_________________|____________________|______________| >> + | | | >> + >________|_________________|____________________|______________ >> + | ___|_________________|____________________|_______ | >> + | | | | >> + | | Platform HW and Firmware | | >> + | |__________________________________________________| | >> + >|______________________________________________________________| >> + >> + >> +1. EDAC Features components - Create feature specific descriptors. >> +For example, EDAC scrub, EDAC ECS, EDAC memory repair in the above >> +diagram. >> + >> +2. EDAC device driver for controlling RAS Features - Get feature's >> +attribute descriptors from EDAC RAS feature component and registers >> +device's RAS features with EDAC bus and exposes the features control >> +attributes via the sysfs EDAC bus. For example, >> +/sys/bus/edac/devices/<dev-name>/<feature>X/ >> + >> +3. RAS dynamic feature controller - Userspace sample modules in >> +rasdaemon for dynamic scrub/repair control to issue scrubbing/repair >> +when excess number of corrected memory errors are reported in a short >span of time. >> diff --git a/Documentation/edac/index.rst >> b/Documentation/edac/index.rst new file mode 100644 index >> 000000000000..b6c265a4cffb >> --- /dev/null >> +++ b/Documentation/edac/index.rst >> @@ -0,0 +1,10 @@ >> +.. SPDX-License-Identifier: GPL-2.0 >> + >> +============== >> +EDAC Subsystem >> +============== >> + >> +.. toctree:: >> + :maxdepth: 1 >> + >> + features >> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c >> index 621dc2a5d034..9fce46dd7405 100644 >> --- a/drivers/edac/edac_device.c >> +++ b/drivers/edac/edac_device.c >> @@ -570,3 +570,103 @@ void edac_device_handle_ue_count(struct >edac_device_ctl_info *edac_dev, >> block ? block->name : "N/A", count, msg); } >> EXPORT_SYMBOL_GPL(edac_device_handle_ue_count); >> + >> +static void edac_dev_release(struct device *dev) { >> + struct edac_dev_feat_ctx *ctx = container_of(dev, struct >> +edac_dev_feat_ctx, dev); >> + >> + kfree(ctx->dev.groups); >> + kfree(ctx); >> +} >> + >> +const struct device_type edac_dev_type = { >> + .name = "edac_dev", >> + .release = edac_dev_release, >> +}; >> + >> +static void edac_dev_unreg(void *data) { >> + device_unregister(data); >> +} >> + >> +/** >> + * edac_dev_register - register device for RAS features with EDAC >> + * @parent: parent device. >> + * @name: parent device's name. >> + * @private: parent driver's data to store in the context if any. >> + * @num_features: number of RAS features to register. >> + * @ras_features: list of RAS features to register. >> + * >> + * Return: >> + * * %0 - Success. >> + * * %-EINVAL - Invalid parameters passed. >> + * * %-ENOMEM - Dynamic memory allocation failed. >> + * >> + */ >> +int edac_dev_register(struct device *parent, char *name, >> + void *private, int num_features, >> + const struct edac_dev_feature *ras_features) { >> + const struct attribute_group **ras_attr_groups; >> + struct edac_dev_feat_ctx *ctx; >> + int attr_gcnt = 0; >> + int ret, feat; >> + >> + if (!parent || !name || !num_features || !ras_features) >> + return -EINVAL; >> + >> + /* Double parse to make space for attributes */ >> + for (feat = 0; feat < num_features; feat++) { >> + switch (ras_features[feat].ft_type) { >> + /* Add feature specific code */ >> + default: >> + return -EINVAL; >> + } >> + } >> + >> + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); >> + if (!ctx) >> + return -ENOMEM; >> + >> + ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), >GFP_KERNEL); >> + if (!ras_attr_groups) { >> + ret = -ENOMEM; >> + goto ctx_free; >> + } >> + >> + attr_gcnt = 0; >> + for (feat = 0; feat < num_features; feat++, ras_features++) { >> + switch (ras_features->ft_type) { >> + /* Add feature specific code */ >> + default: >> + ret = -EINVAL; >> + goto groups_free; >> + } >> + } >> + >> + ctx->dev.parent = parent; >> + ctx->dev.bus = edac_get_sysfs_subsys(); >> + ctx->dev.type = &edac_dev_type; >> + ctx->dev.groups = ras_attr_groups; >> + ctx->private = private; >> + dev_set_drvdata(&ctx->dev, ctx); >> + >> + ret = dev_set_name(&ctx->dev, name); >> + if (ret) >> + goto groups_free; >> + >> + ret = device_register(&ctx->dev); >> + if (ret) { >> + put_device(&ctx->dev); > >> + return ret; > >As register failed, you need to change it to a goto groups_free, as >edac_dev_release() won't be called. As per experimentation edac_dev_release() will be called. > >> + } >> + >> + return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev); >> + >> +groups_free: >> + kfree(ras_attr_groups); >> +ctx_free: >> + kfree(ctx); >> + return ret; >> +} >> +EXPORT_SYMBOL_GPL(edac_dev_register); >> diff --git a/include/linux/edac.h b/include/linux/edac.h index >> b4ee8961e623..521b17113d4d 100644 >> --- a/include/linux/edac.h >> +++ b/include/linux/edac.h >> @@ -661,4 +661,32 @@ static inline struct dimm_info >> *edac_get_dimm(struct mem_ctl_info *mci, >> >> return mci->dimms[index]; >> } >> + >> +#define EDAC_FEAT_NAME_LEN 128 > >This macro was not used on this patch. Sure. > >> + >> +/* RAS feature type */ >> +enum edac_dev_feat { >> + RAS_FEAT_MAX >> +}; >> + >> +/* EDAC device feature information structure */ struct edac_dev_data >> +{ >> + u8 instance; >> + void *private; >> +}; >> + >> +struct edac_dev_feat_ctx { >> + struct device dev; >> + void *private; >> +}; >> + >> +struct edac_dev_feature { >> + enum edac_dev_feat ft_type; >> + u8 instance; >> + void *ctx; >> +}; >> + >> +int edac_dev_register(struct device *parent, char *dev_name, >> + void *parent_pvt_data, int num_features, >> + const struct edac_dev_feature *ras_features); >> #endif /* _LINUX_EDAC_H_ */ > >Thanks, >Mauro Thanks, Shiju
Em Tue, 14 Jan 2025 10:08:42 +0000 Shiju Jose <shiju.jose@huawei.com> escreveu: > >> diff --git a/Documentation/edac/features.rst > >> b/Documentation/edac/features.rst new file mode 100644 index > >> 000000000000..f32f259ce04d > >> --- /dev/null > >> +++ b/Documentation/edac/features.rst > >> @@ -0,0 +1,94 @@ > >> +.. SPDX-License-Identifier: GPL-2.0 > > > >SPDX should match what's written there, e. g. > > > > .. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later > > > >Please notice that GNU FDL family contains both open source and non-open > >source licenses. The open-source one is this: > > > > https://spdx.org/licenses/GFDL-1.2-no-invariants-or-later.html > > > >E.g. it is a the license permits changing the entire document in the future, as > >there's no invariant parts on it. > This seems not widely used, have seen this is used in few documents only. This was added after some discussions I had with LF people in charge of SPDX: GFDL explicitly allows to have some parts that can't be touched by future patches. Those are "invariant" parts of the document. They were designed in a way that the original author's notes can't be touched on any further patch from someone's else. You can see more about that at: https://www.gnu.org/licenses/fdl-howto-opt.en.html See: "The classical example of an invariant nontechnical section in a free manual is the GNU Manifesto, which is included in the GNU Emacs Manual. The GNU Manifesto says nothing about how to edit with Emacs, but it explains the reason why I wrote GNU Emacs" And: https://www.gnu.org/gnu/manifesto.html Due to its nature of being invariant, most people consider it as a non-open-source license. See, for instance: https://www.debian.org/vote/2006/vote_001 Due to such concerns, after several discussions I had with interested parties, this was added to SPDX spec and to the Linux Kernel: - GFDL-1.2-no-invariants-only - for GFDL v 1.2 only - GFDL-1.2-no-invariants-or-later - for GFDL v 1.2 or later (plus variants for other GFDL versions) You may use either one of them, but you should *not* use GFDL-1.2 as this is deprecated: https://spdx.org/licenses/GFDL-1.2.html And need to be replaced by either: https://spdx.org/licenses/GFDL-1.2-no-invariants-or-later.html or: https://spdx.org/licenses/GFDL-1.2-invariants-only.html Thanks, Mauro
diff --git a/Documentation/edac/features.rst b/Documentation/edac/features.rst new file mode 100644 index 000000000000..f32f259ce04d --- /dev/null +++ b/Documentation/edac/features.rst @@ -0,0 +1,94 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================ +Augmenting EDAC for controlling RAS features +============================================ + +Copyright (c) 2024 HiSilicon Limited. + +:Author: Shiju Jose <shiju.jose@huawei.com> +:License: The GNU Free Documentation License, Version 1.2 + (dual licensed under the GPL v2) +:Original Reviewers: + +- Written for: 6.14 + +Introduction +------------ +The expansion of EDAC for controlling RAS features and exposing features +control attributes to userspace via sysfs. Some Examples: + +* Scrub control + +* Error Check Scrub (ECS) control + +* ACPI RAS2 features + +* Post Package Repair (PPR) control + +* Memory Sparing Repair control etc. + +High level design is illustrated in the following diagram:: + + _______________________________________________ + | Userspace - Rasdaemon | + | _____________ | + | | RAS CXL mem | _______________ | + | |error handler|---->| | | + | |_____________| | RAS dynamic | | + | _____________ | scrub, memory | | + | | RAS memory |---->| repair control| | + | |error handler| |_______________| | + | |_____________| | | + |__________________________|____________________| + | + | + _______________________________|______________________________ + | Kernel EDAC extension for | controlling RAS Features | + | ______________________________|____________________________ | + || EDAC Core Sysfs EDAC| Bus | | + || __________________________|_________ _____________ | | + || |/sys/bus/edac/devices/<dev>/scrubX/ | | EDAC device || | + || |/sys/bus/edac/devices/<dev>/ecsX/ |<->| EDAC MC || | + || |/sys/bus/edac/devices/<dev>/repairX | | EDAC sysfs || | + || |____________________________________| |_____________|| | + || EDAC|Bus | | + || | | | + || __________ Get feature | Get feature | | + || | |desc _________|______ desc __________ | | + || |EDAC scrub|<-----| EDAC device | | | | | + || |__________| | driver- RAS |---->| EDAC mem | | | + || __________ | feature control| | repair | | | + || | |<-----|________________| |__________| | | + || |EDAC ECS | Register RAS|features | | + || |__________| | | | + || ______________________|_____________ | | + ||_________|_______________|__________________|______________| | + | _______|____ _______|_______ ____|__________ | + | | | | CXL mem driver| | Client driver | | + | | ACPI RAS2 | | scrub, ECS, | | memory repair | | + | | driver | | sparing, PPR | | features | | + | |____________| |_______________| |_______________| | + | | | | | + |________|_________________|____________________|______________| + | | | + ________|_________________|____________________|______________ + | ___|_________________|____________________|_______ | + | | | | + | | Platform HW and Firmware | | + | |__________________________________________________| | + |______________________________________________________________| + + +1. EDAC Features components - Create feature specific descriptors. +For example, EDAC scrub, EDAC ECS, EDAC memory repair in the above +diagram. + +2. EDAC device driver for controlling RAS Features - Get feature's attribute +descriptors from EDAC RAS feature component and registers device's RAS +features with EDAC bus and exposes the features control attributes via +the sysfs EDAC bus. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/ + +3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for +dynamic scrub/repair control to issue scrubbing/repair when excess number +of corrected memory errors are reported in a short span of time. diff --git a/Documentation/edac/index.rst b/Documentation/edac/index.rst new file mode 100644 index 000000000000..b6c265a4cffb --- /dev/null +++ b/Documentation/edac/index.rst @@ -0,0 +1,10 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +EDAC Subsystem +============== + +.. toctree:: + :maxdepth: 1 + + features diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 621dc2a5d034..9fce46dd7405 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -570,3 +570,103 @@ void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev, block ? block->name : "N/A", count, msg); } EXPORT_SYMBOL_GPL(edac_device_handle_ue_count); + +static void edac_dev_release(struct device *dev) +{ + struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev); + + kfree(ctx->dev.groups); + kfree(ctx); +} + +const struct device_type edac_dev_type = { + .name = "edac_dev", + .release = edac_dev_release, +}; + +static void edac_dev_unreg(void *data) +{ + device_unregister(data); +} + +/** + * edac_dev_register - register device for RAS features with EDAC + * @parent: parent device. + * @name: parent device's name. + * @private: parent driver's data to store in the context if any. + * @num_features: number of RAS features to register. + * @ras_features: list of RAS features to register. + * + * Return: + * * %0 - Success. + * * %-EINVAL - Invalid parameters passed. + * * %-ENOMEM - Dynamic memory allocation failed. + * + */ +int edac_dev_register(struct device *parent, char *name, + void *private, int num_features, + const struct edac_dev_feature *ras_features) +{ + const struct attribute_group **ras_attr_groups; + struct edac_dev_feat_ctx *ctx; + int attr_gcnt = 0; + int ret, feat; + + if (!parent || !name || !num_features || !ras_features) + return -EINVAL; + + /* Double parse to make space for attributes */ + for (feat = 0; feat < num_features; feat++) { + switch (ras_features[feat].ft_type) { + /* Add feature specific code */ + default: + return -EINVAL; + } + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL); + if (!ras_attr_groups) { + ret = -ENOMEM; + goto ctx_free; + } + + attr_gcnt = 0; + for (feat = 0; feat < num_features; feat++, ras_features++) { + switch (ras_features->ft_type) { + /* Add feature specific code */ + default: + ret = -EINVAL; + goto groups_free; + } + } + + ctx->dev.parent = parent; + ctx->dev.bus = edac_get_sysfs_subsys(); + ctx->dev.type = &edac_dev_type; + ctx->dev.groups = ras_attr_groups; + ctx->private = private; + dev_set_drvdata(&ctx->dev, ctx); + + ret = dev_set_name(&ctx->dev, name); + if (ret) + goto groups_free; + + ret = device_register(&ctx->dev); + if (ret) { + put_device(&ctx->dev); + return ret; + } + + return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev); + +groups_free: + kfree(ras_attr_groups); +ctx_free: + kfree(ctx); + return ret; +} +EXPORT_SYMBOL_GPL(edac_dev_register); diff --git a/include/linux/edac.h b/include/linux/edac.h index b4ee8961e623..521b17113d4d 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -661,4 +661,32 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, return mci->dimms[index]; } + +#define EDAC_FEAT_NAME_LEN 128 + +/* RAS feature type */ +enum edac_dev_feat { + RAS_FEAT_MAX +}; + +/* EDAC device feature information structure */ +struct edac_dev_data { + u8 instance; + void *private; +}; + +struct edac_dev_feat_ctx { + struct device dev; + void *private; +}; + +struct edac_dev_feature { + enum edac_dev_feat ft_type; + u8 instance; + void *ctx; +}; + +int edac_dev_register(struct device *parent, char *dev_name, + void *parent_pvt_data, int num_features, + const struct edac_dev_feature *ras_features); #endif /* _LINUX_EDAC_H_ */