diff mbox series

[v18,01/19] EDAC: Add support for EDAC device features control

Message ID 20250106121017.1620-2-shiju.jose@huawei.com (mailing list archive)
State Handled Elsewhere, archived
Headers show
Series EDAC: Scrub: introduce generic EDAC RAS control feature driver + CXL/ACPI-RAS2 drivers | expand

Commit Message

Shiju Jose Jan. 6, 2025, 12:09 p.m. UTC
From: Shiju Jose <shiju.jose@huawei.com>

Add generic EDAC device feature controls supporting the registration
of RAS features available in the system. The driver exposes control
attributes for these features to userspace in
/sys/bus/edac/devices/<dev-name>/<ras-feature>/

Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 Documentation/edac/features.rst |  94 ++++++++++++++++++++++++++++++
 Documentation/edac/index.rst    |  10 ++++
 drivers/edac/edac_device.c      | 100 ++++++++++++++++++++++++++++++++
 include/linux/edac.h            |  28 +++++++++
 4 files changed, 232 insertions(+)
 create mode 100644 Documentation/edac/features.rst
 create mode 100644 Documentation/edac/index.rst

Comments

Borislav Petkov Jan. 6, 2025, 1:37 p.m. UTC | #1
On Mon, Jan 06, 2025 at 12:09:57PM +0000, shiju.jose@huawei.com wrote:
> +int edac_dev_register(struct device *parent, char *name,
> +		      void *private, int num_features,
> +		      const struct edac_dev_feature *ras_features)
> +{
> +	const struct attribute_group **ras_attr_groups;
> +	struct edac_dev_feat_ctx *ctx;
> +	int attr_gcnt = 0;
> +	int ret, feat;
> +
> +	if (!parent || !name || !num_features || !ras_features)
> +		return -EINVAL;
> +
> +	/* Double parse to make space for attributes */
> +	for (feat = 0; feat < num_features; feat++) {
> +		switch (ras_features[feat].ft_type) {
> +		/* Add feature specific code */
> +		default:
> +			return -EINVAL;
> +		}
> +	}
> +
> +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> +	if (!ctx)
> +		return -ENOMEM;
> +
> +	ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL);
> +	if (!ras_attr_groups) {
> +		ret = -ENOMEM;
> +		goto ctx_free;
> +	}
> +
> +	attr_gcnt = 0;
> +	for (feat = 0; feat < num_features; feat++, ras_features++) {
> +		switch (ras_features->ft_type) {
> +		/* Add feature specific code */
> +		default:
> +			ret = -EINVAL;
> +			goto groups_free;
> +		}
> +	}
> +
> +	ctx->dev.parent = parent;
> +	ctx->dev.bus = edac_get_sysfs_subsys();
> +	ctx->dev.type = &edac_dev_type;
> +	ctx->dev.groups = ras_attr_groups;
> +	ctx->private = private;
> +	dev_set_drvdata(&ctx->dev, ctx);
> +
> +	ret = dev_set_name(&ctx->dev, name);
> +	if (ret)
> +		goto groups_free;
> +
> +	ret = device_register(&ctx->dev);
> +	if (ret) {
> +		put_device(&ctx->dev);
> +		return ret;

Who is freeing ctx and ras_attr_groups when you return here?
Shiju Jose Jan. 6, 2025, 2:48 p.m. UTC | #2
>-----Original Message-----
>From: Borislav Petkov <bp@alien8.de>
>Sent: 06 January 2025 13:38
>To: Shiju Jose <shiju.jose@huawei.com>
>Cc: linux-edac@vger.kernel.org; linux-cxl@vger.kernel.org; linux-
>acpi@vger.kernel.org; linux-mm@kvack.org; linux-kernel@vger.kernel.org;
>tony.luck@intel.com; rafael@kernel.org; lenb@kernel.org;
>mchehab@kernel.org; dan.j.williams@intel.com; dave@stgolabs.net; Jonathan
>Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com;
>alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com;
>david@redhat.com; Vilas.Sridharan@amd.com; leo.duran@amd.com;
>Yazen.Ghannam@amd.com; rientjes@google.com; jiaqiyan@google.com;
>Jon.Grimm@amd.com; dave.hansen@linux.intel.com;
>naoya.horiguchi@nec.com; james.morse@arm.com; jthoughton@google.com;
>somasundaram.a@hpe.com; erdemaktas@google.com; pgonda@google.com;
>duenwen@google.com; gthelen@google.com;
>wschwartz@amperecomputing.com; dferguson@amperecomputing.com;
>wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei
><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; Roberto
>Sassu <roberto.sassu@huawei.com>; kangkang.shen@futurewei.com;
>wanghuiqiang <wanghuiqiang@huawei.com>; Linuxarm
><linuxarm@huawei.com>
>Subject: Re: [PATCH v18 01/19] EDAC: Add support for EDAC device features
>control
>
>On Mon, Jan 06, 2025 at 12:09:57PM +0000, shiju.jose@huawei.com wrote:
>> +int edac_dev_register(struct device *parent, char *name,
>> +		      void *private, int num_features,
>> +		      const struct edac_dev_feature *ras_features) {
>> +	const struct attribute_group **ras_attr_groups;
>> +	struct edac_dev_feat_ctx *ctx;
>> +	int attr_gcnt = 0;
>> +	int ret, feat;
>> +
>> +	if (!parent || !name || !num_features || !ras_features)
>> +		return -EINVAL;
>> +
>> +	/* Double parse to make space for attributes */
>> +	for (feat = 0; feat < num_features; feat++) {
>> +		switch (ras_features[feat].ft_type) {
>> +		/* Add feature specific code */
>> +		default:
>> +			return -EINVAL;
>> +		}
>> +	}
>> +
>> +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
>> +	if (!ctx)
>> +		return -ENOMEM;
>> +
>> +	ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups),
>GFP_KERNEL);
>> +	if (!ras_attr_groups) {
>> +		ret = -ENOMEM;
>> +		goto ctx_free;
>> +	}
>> +
>> +	attr_gcnt = 0;
>> +	for (feat = 0; feat < num_features; feat++, ras_features++) {
>> +		switch (ras_features->ft_type) {
>> +		/* Add feature specific code */
>> +		default:
>> +			ret = -EINVAL;
>> +			goto groups_free;
>> +		}
>> +	}
>> +
>> +	ctx->dev.parent = parent;
>> +	ctx->dev.bus = edac_get_sysfs_subsys();
>> +	ctx->dev.type = &edac_dev_type;
>> +	ctx->dev.groups = ras_attr_groups;
>> +	ctx->private = private;
>> +	dev_set_drvdata(&ctx->dev, ctx);
>> +
>> +	ret = dev_set_name(&ctx->dev, name);
>> +	if (ret)
>> +		goto groups_free;
>> +
>> +	ret = device_register(&ctx->dev);
>> +	if (ret) {
>> +		put_device(&ctx->dev);
>> +		return ret;
>
>Who is freeing ctx and ras_attr_groups when you return here?

Hi Boris,

ctx and ras_attr_groups are freed in the callback
function  for release  edac_dev_release(struct device *dev).

Thanks,
Shiju

>
>--
>Regards/Gruss,
>    Boris.
>
>https://people.kernel.org/tglx/notes-about-netiquette
Mauro Carvalho Chehab Jan. 13, 2025, 3:06 p.m. UTC | #3
Em Mon, 6 Jan 2025 12:09:57 +0000
<shiju.jose@huawei.com> escreveu:

> From: Shiju Jose <shiju.jose@huawei.com>
> 
> Add generic EDAC device feature controls supporting the registration
> of RAS features available in the system. The driver exposes control
> attributes for these features to userspace in
> /sys/bus/edac/devices/<dev-name>/<ras-feature>/
> 
> Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
> ---
>  Documentation/edac/features.rst |  94 ++++++++++++++++++++++++++++++
>  Documentation/edac/index.rst    |  10 ++++
>  drivers/edac/edac_device.c      | 100 ++++++++++++++++++++++++++++++++
>  include/linux/edac.h            |  28 +++++++++
>  4 files changed, 232 insertions(+)
>  create mode 100644 Documentation/edac/features.rst
>  create mode 100644 Documentation/edac/index.rst
> 
> diff --git a/Documentation/edac/features.rst b/Documentation/edac/features.rst
> new file mode 100644
> index 000000000000..f32f259ce04d
> --- /dev/null
> +++ b/Documentation/edac/features.rst
> @@ -0,0 +1,94 @@
> +.. SPDX-License-Identifier: GPL-2.0

SPDX should match what's written there, e. g.

	.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later

Please notice that GNU FDL family contains both open source and non-open
source licenses. The open-source one is this:

	https://spdx.org/licenses/GFDL-1.2-no-invariants-or-later.html

E.g. it is a the license permits changing the entire document in
the future, as there's no invariant parts on it.

> +
> +============================================
> +Augmenting EDAC for controlling RAS features
> +============================================
> +
> +Copyright (c) 2024 HiSilicon Limited.

2024-2025?

> +
> +:Author:   Shiju Jose <shiju.jose@huawei.com>
> +:License:  The GNU Free Documentation License, Version 1.2
> +          (dual licensed under the GPL v2)

You need to define if invariant parts are allowed or not, e. g.:

	:License: The GNU Free Documentation License, Version 1.2 without Invariant Sections, Front-Cover Texts nor Back-Cover Texts.
		  (dual licensed under the GPL v2)


> +:Original Reviewers:
> +
> +- Written for: 6.14
> +
> +Introduction
> +------------
> +The expansion of EDAC for controlling RAS features and exposing features
> +control attributes to userspace via sysfs. Some Examples:
> +
> +* Scrub control
> +
> +* Error Check Scrub (ECS) control
> +
> +* ACPI RAS2 features
> +
> +* Post Package Repair (PPR) control
> +
> +* Memory Sparing Repair control etc.
> +
> +High level design is illustrated in the following diagram::
> +
> +         _______________________________________________
> +        |   Userspace - Rasdaemon                       |
> +        |  _____________                                |
> +        | | RAS CXL mem |      _______________          |
> +        | |error handler|---->|               |         |
> +        | |_____________|     | RAS dynamic   |         |
> +        |  _____________      | scrub, memory |         |
> +        | | RAS memory  |---->| repair control|         |
> +        | |error handler|     |_______________|         |
> +        | |_____________|          |                    |
> +        |__________________________|____________________|
> +                                   |
> +                                   |
> +    _______________________________|______________________________
> +   |     Kernel EDAC extension for | controlling RAS Features     |
> +   | ______________________________|____________________________  |
> +   || EDAC Core          Sysfs EDAC| Bus                        | |
> +   ||    __________________________|_________     _____________ | |
> +   ||   |/sys/bus/edac/devices/<dev>/scrubX/ |   | EDAC device || |
> +   ||   |/sys/bus/edac/devices/<dev>/ecsX/   |<->| EDAC MC     || |
> +   ||   |/sys/bus/edac/devices/<dev>/repairX |   | EDAC sysfs  || |
> +   ||   |____________________________________|   |_____________|| |
> +   ||                           EDAC|Bus                        | |
> +   ||                               |                           | |
> +   ||    __________ Get feature     |      Get feature          | |
> +   ||   |          |desc   _________|______ desc  __________    | |
> +   ||   |EDAC scrub|<-----| EDAC device    |     |          |   | |
> +   ||   |__________|      | driver- RAS    |---->| EDAC mem |   | |
> +   ||    __________       | feature control|     | repair   |   | |
> +   ||   |          |<-----|________________|     |__________|   | |
> +   ||   |EDAC ECS  |    Register RAS|features                   | |
> +   ||   |__________|                |                           | |
> +   ||         ______________________|_____________              | |
> +   ||_________|_______________|__________________|______________| |
> +   |   _______|____    _______|_______       ____|__________      |
> +   |  |            |  | CXL mem driver|     | Client driver |     |
> +   |  | ACPI RAS2  |  | scrub, ECS,   |     | memory repair |     |
> +   |  | driver     |  | sparing, PPR  |     | features      |     |
> +   |  |____________|  |_______________|     |_______________|     |
> +   |        |                 |                    |              |
> +   |________|_________________|____________________|______________|
> +            |                 |                    |
> +    ________|_________________|____________________|______________
> +   |     ___|_________________|____________________|_______       |
> +   |    |                                                  |      |
> +   |    |            Platform HW and Firmware              |      |
> +   |    |__________________________________________________|      |
> +   |______________________________________________________________|
> +
> +
> +1. EDAC Features components - Create feature specific descriptors.
> +For example, EDAC scrub, EDAC ECS, EDAC memory repair in the above
> +diagram.
> +
> +2. EDAC device driver for controlling RAS Features - Get feature's attribute
> +descriptors from EDAC RAS feature component and registers device's RAS
> +features with EDAC bus and exposes the features control attributes via
> +the sysfs EDAC bus. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/
> +
> +3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for
> +dynamic scrub/repair control to issue scrubbing/repair when excess number
> +of corrected memory errors are reported in a short span of time.
> diff --git a/Documentation/edac/index.rst b/Documentation/edac/index.rst
> new file mode 100644
> index 000000000000..b6c265a4cffb
> --- /dev/null
> +++ b/Documentation/edac/index.rst
> @@ -0,0 +1,10 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +==============
> +EDAC Subsystem
> +==============
> +
> +.. toctree::
> +   :maxdepth: 1
> +
> +   features
> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
> index 621dc2a5d034..9fce46dd7405 100644
> --- a/drivers/edac/edac_device.c
> +++ b/drivers/edac/edac_device.c
> @@ -570,3 +570,103 @@ void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
>  		      block ? block->name : "N/A", count, msg);
>  }
>  EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
> +
> +static void edac_dev_release(struct device *dev)
> +{
> +	struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev);
> +
> +	kfree(ctx->dev.groups);
> +	kfree(ctx);
> +}
> +
> +const struct device_type edac_dev_type = {
> +	.name = "edac_dev",
> +	.release = edac_dev_release,
> +};
> +
> +static void edac_dev_unreg(void *data)
> +{
> +	device_unregister(data);
> +}
> +
> +/**
> + * edac_dev_register - register device for RAS features with EDAC
> + * @parent: parent device.
> + * @name: parent device's name.
> + * @private: parent driver's data to store in the context if any.
> + * @num_features: number of RAS features to register.
> + * @ras_features: list of RAS features to register.
> + *
> + * Return:
> + *  * %0       - Success.
> + *  * %-EINVAL - Invalid parameters passed.
> + *  * %-ENOMEM - Dynamic memory allocation failed.
> + *
> + */
> +int edac_dev_register(struct device *parent, char *name,
> +		      void *private, int num_features,
> +		      const struct edac_dev_feature *ras_features)
> +{
> +	const struct attribute_group **ras_attr_groups;
> +	struct edac_dev_feat_ctx *ctx;
> +	int attr_gcnt = 0;
> +	int ret, feat;
> +
> +	if (!parent || !name || !num_features || !ras_features)
> +		return -EINVAL;
> +
> +	/* Double parse to make space for attributes */
> +	for (feat = 0; feat < num_features; feat++) {
> +		switch (ras_features[feat].ft_type) {
> +		/* Add feature specific code */
> +		default:
> +			return -EINVAL;
> +		}
> +	}
> +
> +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> +	if (!ctx)
> +		return -ENOMEM;
> +
> +	ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL);
> +	if (!ras_attr_groups) {
> +		ret = -ENOMEM;
> +		goto ctx_free;
> +	}
> +
> +	attr_gcnt = 0;
> +	for (feat = 0; feat < num_features; feat++, ras_features++) {
> +		switch (ras_features->ft_type) {
> +		/* Add feature specific code */
> +		default:
> +			ret = -EINVAL;
> +			goto groups_free;
> +		}
> +	}
> +
> +	ctx->dev.parent = parent;
> +	ctx->dev.bus = edac_get_sysfs_subsys();
> +	ctx->dev.type = &edac_dev_type;
> +	ctx->dev.groups = ras_attr_groups;
> +	ctx->private = private;
> +	dev_set_drvdata(&ctx->dev, ctx);
> +
> +	ret = dev_set_name(&ctx->dev, name);
> +	if (ret)
> +		goto groups_free;
> +
> +	ret = device_register(&ctx->dev);
> +	if (ret) {
> +		put_device(&ctx->dev);

> +		return ret;

As register failed, you need to change it to a goto groups_free,
as edac_dev_release() won't be called.

> +	}
> +
> +	return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev);
> +
> +groups_free:
> +	kfree(ras_attr_groups);
> +ctx_free:
> +	kfree(ctx);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(edac_dev_register);
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index b4ee8961e623..521b17113d4d 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -661,4 +661,32 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,
>  
>  	return mci->dimms[index];
>  }
> +
> +#define EDAC_FEAT_NAME_LEN	128

This macro was not used on this patch.

> +
> +/* RAS feature type */
> +enum edac_dev_feat {
> +	RAS_FEAT_MAX
> +};
> +
> +/* EDAC device feature information structure */
> +struct edac_dev_data {
> +	u8 instance;
> +	void *private;
> +};
> +
> +struct edac_dev_feat_ctx {
> +	struct device dev;
> +	void *private;
> +};
> +
> +struct edac_dev_feature {
> +	enum edac_dev_feat ft_type;
> +	u8 instance;
> +	void *ctx;
> +};
> +
> +int edac_dev_register(struct device *parent, char *dev_name,
> +		      void *parent_pvt_data, int num_features,
> +		      const struct edac_dev_feature *ras_features);
>  #endif /* _LINUX_EDAC_H_ */

Thanks,
Mauro
Jonathan Cameron Jan. 14, 2025, 9:55 a.m. UTC | #4
> > +int edac_dev_register(struct device *parent, char *name,
> > +		      void *private, int num_features,
> > +		      const struct edac_dev_feature *ras_features)
> > +{
> > +	const struct attribute_group **ras_attr_groups;
> > +	struct edac_dev_feat_ctx *ctx;
> > +	int attr_gcnt = 0;
> > +	int ret, feat;
> > +
> > +	if (!parent || !name || !num_features || !ras_features)
> > +		return -EINVAL;
> > +
> > +	/* Double parse to make space for attributes */
> > +	for (feat = 0; feat < num_features; feat++) {
> > +		switch (ras_features[feat].ft_type) {
> > +		/* Add feature specific code */
> > +		default:
> > +			return -EINVAL;
> > +		}
> > +	}
> > +
> > +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> > +	if (!ctx)
> > +		return -ENOMEM;
> > +
> > +	ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL);
> > +	if (!ras_attr_groups) {
> > +		ret = -ENOMEM;
> > +		goto ctx_free;
> > +	}
> > +
> > +	attr_gcnt = 0;
> > +	for (feat = 0; feat < num_features; feat++, ras_features++) {
> > +		switch (ras_features->ft_type) {
> > +		/* Add feature specific code */
> > +		default:
> > +			ret = -EINVAL;
> > +			goto groups_free;
> > +		}
> > +	}
> > +
> > +	ctx->dev.parent = parent;
> > +	ctx->dev.bus = edac_get_sysfs_subsys();
> > +	ctx->dev.type = &edac_dev_type;
> > +	ctx->dev.groups = ras_attr_groups;
> > +	ctx->private = private;
> > +	dev_set_drvdata(&ctx->dev, ctx);
> > +
> > +	ret = dev_set_name(&ctx->dev, name);
> > +	if (ret)
> > +		goto groups_free;
> > +
> > +	ret = device_register(&ctx->dev);
> > +	if (ret) {
> > +		put_device(&ctx->dev);  
> 
> > +		return ret;  
> 
> As register failed, you need to change it to a goto groups_free,
> as edac_dev_release() won't be called.

Boris called this one out as well, so seems it is not that well understood.
I've also tripped over this in the past and it's one of the most common
things I catch in reviews of code calling this stuff.

As discussed offline, it will be called. The device_register() docs
make it clear that whether or not that call succeeds reference counting
is enabled and put_device() is the correct way to free resources.

The actual depends on the fact that device_register() is just a helper
defined as

device_initialize();
return device_add();

So for reasons lost to history (I guess there are cases where other cleanup
needs to happen before the release) it does not handle side effects
of device_initialize() on an error in device_add().  

device_initialize() has called
-> kobject_init(&dev->kobj, &device_type);
 -> kref_init_internal(kobj) + sets ktype (which has the release callback)

kref_init_internal() sets the reference counter to 1

Hence when we do a device_put() in the error path, the reference counter drops
to 0 and the release from the ktype is called.  Here that is edac_dev_release();

If you want to verify replace device_register() with device_initialize() then
call put_device().

If we were going back in history, I'd suggest device_register() should be side
effect free and call put_device() on error and any driver that needs to handle
other stuff before the release should just not use it. I guess that ship
long sailed and maybe there are other reasons I've not thought of.

I took a quick look and seems to go back into at least the 2.5 era.

Jonathan
Shiju Jose Jan. 14, 2025, 10:08 a.m. UTC | #5
>-----Original Message-----
>From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
>Sent: 13 January 2025 15:06
>To: Shiju Jose <shiju.jose@huawei.com>
>Cc: linux-edac@vger.kernel.org; linux-cxl@vger.kernel.org; linux-
>acpi@vger.kernel.org; linux-mm@kvack.org; linux-kernel@vger.kernel.org;
>bp@alien8.de; tony.luck@intel.com; rafael@kernel.org; lenb@kernel.org;
>mchehab@kernel.org; dan.j.williams@intel.com; dave@stgolabs.net; Jonathan
>Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com;
>alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com;
>david@redhat.com; Vilas.Sridharan@amd.com; leo.duran@amd.com;
>Yazen.Ghannam@amd.com; rientjes@google.com; jiaqiyan@google.com;
>Jon.Grimm@amd.com; dave.hansen@linux.intel.com;
>naoya.horiguchi@nec.com; james.morse@arm.com; jthoughton@google.com;
>somasundaram.a@hpe.com; erdemaktas@google.com; pgonda@google.com;
>duenwen@google.com; gthelen@google.com;
>wschwartz@amperecomputing.com; dferguson@amperecomputing.com;
>wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei
><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; Roberto
>Sassu <roberto.sassu@huawei.com>; kangkang.shen@futurewei.com;
>wanghuiqiang <wanghuiqiang@huawei.com>; Linuxarm
><linuxarm@huawei.com>
>Subject: Re: [PATCH v18 01/19] EDAC: Add support for EDAC device features
>control
>
>Em Mon, 6 Jan 2025 12:09:57 +0000
><shiju.jose@huawei.com> escreveu:
>
>> From: Shiju Jose <shiju.jose@huawei.com>
>>
>> Add generic EDAC device feature controls supporting the registration
>> of RAS features available in the system. The driver exposes control
>> attributes for these features to userspace in
>> /sys/bus/edac/devices/<dev-name>/<ras-feature>/
>>
>> Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>> Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
>> ---
>>  Documentation/edac/features.rst |  94 ++++++++++++++++++++++++++++++
>>  Documentation/edac/index.rst    |  10 ++++
>>  drivers/edac/edac_device.c      | 100 ++++++++++++++++++++++++++++++++
>>  include/linux/edac.h            |  28 +++++++++
>>  4 files changed, 232 insertions(+)
>>  create mode 100644 Documentation/edac/features.rst  create mode
>> 100644 Documentation/edac/index.rst
>>
>> diff --git a/Documentation/edac/features.rst
>> b/Documentation/edac/features.rst new file mode 100644 index
>> 000000000000..f32f259ce04d
>> --- /dev/null
>> +++ b/Documentation/edac/features.rst
>> @@ -0,0 +1,94 @@
>> +.. SPDX-License-Identifier: GPL-2.0
>
>SPDX should match what's written there, e. g.
>
>	.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later
>
>Please notice that GNU FDL family contains both open source and non-open
>source licenses. The open-source one is this:
>
>	https://spdx.org/licenses/GFDL-1.2-no-invariants-or-later.html
>
>E.g. it is a the license permits changing the entire document in the future, as
>there's no invariant parts on it.
This seems not widely used, have seen this is used in few documents only. 

>
>> +
>> +============================================
>> +Augmenting EDAC for controlling RAS features
>> +============================================
>> +
>> +Copyright (c) 2024 HiSilicon Limited.
>
>2024-2025?
Will do.  
>
>> +
>> +:Author:   Shiju Jose <shiju.jose@huawei.com>
>> +:License:  The GNU Free Documentation License, Version 1.2
>> +          (dual licensed under the GPL v2)
>
>You need to define if invariant parts are allowed or not, e. g.:
>
>	:License: The GNU Free Documentation License, Version 1.2 without
>Invariant Sections, Front-Cover Texts nor Back-Cover Texts.
>		  (dual licensed under the GPL v2)
Same as above.
>
>
>> +:Original Reviewers:
>> +
>> +- Written for: 6.14
>> +
>> +Introduction
>> +------------
>> +The expansion of EDAC for controlling RAS features and exposing
>> +features control attributes to userspace via sysfs. Some Examples:
>> +
>> +* Scrub control
>> +
>> +* Error Check Scrub (ECS) control
>> +
>> +* ACPI RAS2 features
>> +
>> +* Post Package Repair (PPR) control
>> +
>> +* Memory Sparing Repair control etc.
>> +
>> +High level design is illustrated in the following diagram::
>> +
>> +         _______________________________________________
>> +        |   Userspace - Rasdaemon                       |
>> +        |  _____________                                |
>> +        | | RAS CXL mem |      _______________          |
>> +        | |error handler|---->|               |         |
>> +        | |_____________|     | RAS dynamic   |         |
>> +        |  _____________      | scrub, memory |         |
>> +        | | RAS memory  |---->| repair control|         |
>> +        | |error handler|     |_______________|         |
>> +        | |_____________|          |                    |
>> +        |__________________________|____________________|
>> +                                   |
>> +                                   |
>> +
>_______________________________|______________________________
>> +   |     Kernel EDAC extension for | controlling RAS Features     |
>> +   | ______________________________|____________________________
>|
>> +   || EDAC Core          Sysfs EDAC| Bus                        | |
>> +   ||    __________________________|_________     _____________ | |
>> +   ||   |/sys/bus/edac/devices/<dev>/scrubX/ |   | EDAC device || |
>> +   ||   |/sys/bus/edac/devices/<dev>/ecsX/   |<->| EDAC MC     || |
>> +   ||   |/sys/bus/edac/devices/<dev>/repairX |   | EDAC sysfs  || |
>> +   ||   |____________________________________|   |_____________|| |
>> +   ||                           EDAC|Bus                        | |
>> +   ||                               |                           | |
>> +   ||    __________ Get feature     |      Get feature          | |
>> +   ||   |          |desc   _________|______ desc  __________    | |
>> +   ||   |EDAC scrub|<-----| EDAC device    |     |          |   | |
>> +   ||   |__________|      | driver- RAS    |---->| EDAC mem |   | |
>> +   ||    __________       | feature control|     | repair   |   | |
>> +   ||   |          |<-----|________________|     |__________|   | |
>> +   ||   |EDAC ECS  |    Register RAS|features                   | |
>> +   ||   |__________|                |                           | |
>> +   ||         ______________________|_____________              | |
>> +   ||_________|_______________|__________________|______________|
>|
>> +   |   _______|____    _______|_______       ____|__________      |
>> +   |  |            |  | CXL mem driver|     | Client driver |     |
>> +   |  | ACPI RAS2  |  | scrub, ECS,   |     | memory repair |     |
>> +   |  | driver     |  | sparing, PPR  |     | features      |     |
>> +   |  |____________|  |_______________|     |_______________|     |
>> +   |        |                 |                    |              |
>> +
>|________|_________________|____________________|______________|
>> +            |                 |                    |
>> +
>________|_________________|____________________|______________
>> +   |     ___|_________________|____________________|_______       |
>> +   |    |                                                  |      |
>> +   |    |            Platform HW and Firmware              |      |
>> +   |    |__________________________________________________|      |
>> +
>|______________________________________________________________|
>> +
>> +
>> +1. EDAC Features components - Create feature specific descriptors.
>> +For example, EDAC scrub, EDAC ECS, EDAC memory repair in the above
>> +diagram.
>> +
>> +2. EDAC device driver for controlling RAS Features - Get feature's
>> +attribute descriptors from EDAC RAS feature component and registers
>> +device's RAS features with EDAC bus and exposes the features control
>> +attributes via the sysfs EDAC bus. For example,
>> +/sys/bus/edac/devices/<dev-name>/<feature>X/
>> +
>> +3. RAS dynamic feature controller - Userspace sample modules in
>> +rasdaemon for dynamic scrub/repair control to issue scrubbing/repair
>> +when excess number of corrected memory errors are reported in a short
>span of time.
>> diff --git a/Documentation/edac/index.rst
>> b/Documentation/edac/index.rst new file mode 100644 index
>> 000000000000..b6c265a4cffb
>> --- /dev/null
>> +++ b/Documentation/edac/index.rst
>> @@ -0,0 +1,10 @@
>> +.. SPDX-License-Identifier: GPL-2.0
>> +
>> +==============
>> +EDAC Subsystem
>> +==============
>> +
>> +.. toctree::
>> +   :maxdepth: 1
>> +
>> +   features
>> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
>> index 621dc2a5d034..9fce46dd7405 100644
>> --- a/drivers/edac/edac_device.c
>> +++ b/drivers/edac/edac_device.c
>> @@ -570,3 +570,103 @@ void edac_device_handle_ue_count(struct
>edac_device_ctl_info *edac_dev,
>>  		      block ? block->name : "N/A", count, msg);  }
>> EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
>> +
>> +static void edac_dev_release(struct device *dev) {
>> +	struct edac_dev_feat_ctx *ctx = container_of(dev, struct
>> +edac_dev_feat_ctx, dev);
>> +
>> +	kfree(ctx->dev.groups);
>> +	kfree(ctx);
>> +}
>> +
>> +const struct device_type edac_dev_type = {
>> +	.name = "edac_dev",
>> +	.release = edac_dev_release,
>> +};
>> +
>> +static void edac_dev_unreg(void *data) {
>> +	device_unregister(data);
>> +}
>> +
>> +/**
>> + * edac_dev_register - register device for RAS features with EDAC
>> + * @parent: parent device.
>> + * @name: parent device's name.
>> + * @private: parent driver's data to store in the context if any.
>> + * @num_features: number of RAS features to register.
>> + * @ras_features: list of RAS features to register.
>> + *
>> + * Return:
>> + *  * %0       - Success.
>> + *  * %-EINVAL - Invalid parameters passed.
>> + *  * %-ENOMEM - Dynamic memory allocation failed.
>> + *
>> + */
>> +int edac_dev_register(struct device *parent, char *name,
>> +		      void *private, int num_features,
>> +		      const struct edac_dev_feature *ras_features) {
>> +	const struct attribute_group **ras_attr_groups;
>> +	struct edac_dev_feat_ctx *ctx;
>> +	int attr_gcnt = 0;
>> +	int ret, feat;
>> +
>> +	if (!parent || !name || !num_features || !ras_features)
>> +		return -EINVAL;
>> +
>> +	/* Double parse to make space for attributes */
>> +	for (feat = 0; feat < num_features; feat++) {
>> +		switch (ras_features[feat].ft_type) {
>> +		/* Add feature specific code */
>> +		default:
>> +			return -EINVAL;
>> +		}
>> +	}
>> +
>> +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
>> +	if (!ctx)
>> +		return -ENOMEM;
>> +
>> +	ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups),
>GFP_KERNEL);
>> +	if (!ras_attr_groups) {
>> +		ret = -ENOMEM;
>> +		goto ctx_free;
>> +	}
>> +
>> +	attr_gcnt = 0;
>> +	for (feat = 0; feat < num_features; feat++, ras_features++) {
>> +		switch (ras_features->ft_type) {
>> +		/* Add feature specific code */
>> +		default:
>> +			ret = -EINVAL;
>> +			goto groups_free;
>> +		}
>> +	}
>> +
>> +	ctx->dev.parent = parent;
>> +	ctx->dev.bus = edac_get_sysfs_subsys();
>> +	ctx->dev.type = &edac_dev_type;
>> +	ctx->dev.groups = ras_attr_groups;
>> +	ctx->private = private;
>> +	dev_set_drvdata(&ctx->dev, ctx);
>> +
>> +	ret = dev_set_name(&ctx->dev, name);
>> +	if (ret)
>> +		goto groups_free;
>> +
>> +	ret = device_register(&ctx->dev);
>> +	if (ret) {
>> +		put_device(&ctx->dev);
>
>> +		return ret;
>
>As register failed, you need to change it to a goto groups_free, as
>edac_dev_release() won't be called.
As per experimentation edac_dev_release() will be called.

>
>> +	}
>> +
>> +	return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev);
>> +
>> +groups_free:
>> +	kfree(ras_attr_groups);
>> +ctx_free:
>> +	kfree(ctx);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(edac_dev_register);
>> diff --git a/include/linux/edac.h b/include/linux/edac.h index
>> b4ee8961e623..521b17113d4d 100644
>> --- a/include/linux/edac.h
>> +++ b/include/linux/edac.h
>> @@ -661,4 +661,32 @@ static inline struct dimm_info
>> *edac_get_dimm(struct mem_ctl_info *mci,
>>
>>  	return mci->dimms[index];
>>  }
>> +
>> +#define EDAC_FEAT_NAME_LEN	128
>
>This macro was not used on this patch.
Sure.
>
>> +
>> +/* RAS feature type */
>> +enum edac_dev_feat {
>> +	RAS_FEAT_MAX
>> +};
>> +
>> +/* EDAC device feature information structure */ struct edac_dev_data
>> +{
>> +	u8 instance;
>> +	void *private;
>> +};
>> +
>> +struct edac_dev_feat_ctx {
>> +	struct device dev;
>> +	void *private;
>> +};
>> +
>> +struct edac_dev_feature {
>> +	enum edac_dev_feat ft_type;
>> +	u8 instance;
>> +	void *ctx;
>> +};
>> +
>> +int edac_dev_register(struct device *parent, char *dev_name,
>> +		      void *parent_pvt_data, int num_features,
>> +		      const struct edac_dev_feature *ras_features);
>>  #endif /* _LINUX_EDAC_H_ */
>
>Thanks,
>Mauro

Thanks,
Shiju
Mauro Carvalho Chehab Jan. 14, 2025, 11:33 a.m. UTC | #6
Em Tue, 14 Jan 2025 10:08:42 +0000
Shiju Jose <shiju.jose@huawei.com> escreveu:

> >> diff --git a/Documentation/edac/features.rst
> >> b/Documentation/edac/features.rst new file mode 100644 index
> >> 000000000000..f32f259ce04d
> >> --- /dev/null
> >> +++ b/Documentation/edac/features.rst
> >> @@ -0,0 +1,94 @@
> >> +.. SPDX-License-Identifier: GPL-2.0  
> >
> >SPDX should match what's written there, e. g.
> >
> >	.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later
> >
> >Please notice that GNU FDL family contains both open source and non-open
> >source licenses. The open-source one is this:
> >
> >	https://spdx.org/licenses/GFDL-1.2-no-invariants-or-later.html
> >
> >E.g. it is a the license permits changing the entire document in the future, as
> >there's no invariant parts on it.  
> This seems not widely used, have seen this is used in few documents only. 

This was added after some discussions I had with LF people in charge
of SPDX: GFDL explicitly allows to have some parts that can't be touched
by future patches. Those are "invariant" parts of the document.

They were designed in a way that the original author's notes
can't be touched on any further patch from someone's else. 

You can see more about that at:

	https://www.gnu.org/licenses/fdl-howto-opt.en.html

See:

	"The classical example of an invariant nontechnical section in a free manual
	 is the GNU Manifesto, which is included in the GNU Emacs Manual. The GNU
	 Manifesto says nothing about how to edit with Emacs, but it explains the
	 reason why I wrote GNU Emacs"

And:
	https://www.gnu.org/gnu/manifesto.html

Due to its nature of being invariant, most people consider it as a
non-open-source license. See, for instance:

	https://www.debian.org/vote/2006/vote_001

Due to such concerns, after several discussions I had with interested parties,
this was added to SPDX spec and to the Linux Kernel:

	- GFDL-1.2-no-invariants-only - for GFDL v 1.2 only
	- GFDL-1.2-no-invariants-or-later - for GFDL v 1.2 or later

(plus variants for other GFDL versions)

You may use either one of them, but you should *not* use GFDL-1.2
as this is deprecated:

	https://spdx.org/licenses/GFDL-1.2.html

And need to be replaced by either:

	https://spdx.org/licenses/GFDL-1.2-no-invariants-or-later.html
or:
	https://spdx.org/licenses/GFDL-1.2-invariants-only.html

Thanks,
Mauro
diff mbox series

Patch

diff --git a/Documentation/edac/features.rst b/Documentation/edac/features.rst
new file mode 100644
index 000000000000..f32f259ce04d
--- /dev/null
+++ b/Documentation/edac/features.rst
@@ -0,0 +1,94 @@ 
+.. SPDX-License-Identifier: GPL-2.0
+
+============================================
+Augmenting EDAC for controlling RAS features
+============================================
+
+Copyright (c) 2024 HiSilicon Limited.
+
+:Author:   Shiju Jose <shiju.jose@huawei.com>
+:License:  The GNU Free Documentation License, Version 1.2
+          (dual licensed under the GPL v2)
+:Original Reviewers:
+
+- Written for: 6.14
+
+Introduction
+------------
+The expansion of EDAC for controlling RAS features and exposing features
+control attributes to userspace via sysfs. Some Examples:
+
+* Scrub control
+
+* Error Check Scrub (ECS) control
+
+* ACPI RAS2 features
+
+* Post Package Repair (PPR) control
+
+* Memory Sparing Repair control etc.
+
+High level design is illustrated in the following diagram::
+
+         _______________________________________________
+        |   Userspace - Rasdaemon                       |
+        |  _____________                                |
+        | | RAS CXL mem |      _______________          |
+        | |error handler|---->|               |         |
+        | |_____________|     | RAS dynamic   |         |
+        |  _____________      | scrub, memory |         |
+        | | RAS memory  |---->| repair control|         |
+        | |error handler|     |_______________|         |
+        | |_____________|          |                    |
+        |__________________________|____________________|
+                                   |
+                                   |
+    _______________________________|______________________________
+   |     Kernel EDAC extension for | controlling RAS Features     |
+   | ______________________________|____________________________  |
+   || EDAC Core          Sysfs EDAC| Bus                        | |
+   ||    __________________________|_________     _____________ | |
+   ||   |/sys/bus/edac/devices/<dev>/scrubX/ |   | EDAC device || |
+   ||   |/sys/bus/edac/devices/<dev>/ecsX/   |<->| EDAC MC     || |
+   ||   |/sys/bus/edac/devices/<dev>/repairX |   | EDAC sysfs  || |
+   ||   |____________________________________|   |_____________|| |
+   ||                           EDAC|Bus                        | |
+   ||                               |                           | |
+   ||    __________ Get feature     |      Get feature          | |
+   ||   |          |desc   _________|______ desc  __________    | |
+   ||   |EDAC scrub|<-----| EDAC device    |     |          |   | |
+   ||   |__________|      | driver- RAS    |---->| EDAC mem |   | |
+   ||    __________       | feature control|     | repair   |   | |
+   ||   |          |<-----|________________|     |__________|   | |
+   ||   |EDAC ECS  |    Register RAS|features                   | |
+   ||   |__________|                |                           | |
+   ||         ______________________|_____________              | |
+   ||_________|_______________|__________________|______________| |
+   |   _______|____    _______|_______       ____|__________      |
+   |  |            |  | CXL mem driver|     | Client driver |     |
+   |  | ACPI RAS2  |  | scrub, ECS,   |     | memory repair |     |
+   |  | driver     |  | sparing, PPR  |     | features      |     |
+   |  |____________|  |_______________|     |_______________|     |
+   |        |                 |                    |              |
+   |________|_________________|____________________|______________|
+            |                 |                    |
+    ________|_________________|____________________|______________
+   |     ___|_________________|____________________|_______       |
+   |    |                                                  |      |
+   |    |            Platform HW and Firmware              |      |
+   |    |__________________________________________________|      |
+   |______________________________________________________________|
+
+
+1. EDAC Features components - Create feature specific descriptors.
+For example, EDAC scrub, EDAC ECS, EDAC memory repair in the above
+diagram.
+
+2. EDAC device driver for controlling RAS Features - Get feature's attribute
+descriptors from EDAC RAS feature component and registers device's RAS
+features with EDAC bus and exposes the features control attributes via
+the sysfs EDAC bus. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/
+
+3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for
+dynamic scrub/repair control to issue scrubbing/repair when excess number
+of corrected memory errors are reported in a short span of time.
diff --git a/Documentation/edac/index.rst b/Documentation/edac/index.rst
new file mode 100644
index 000000000000..b6c265a4cffb
--- /dev/null
+++ b/Documentation/edac/index.rst
@@ -0,0 +1,10 @@ 
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+EDAC Subsystem
+==============
+
+.. toctree::
+   :maxdepth: 1
+
+   features
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index 621dc2a5d034..9fce46dd7405 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -570,3 +570,103 @@  void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
 		      block ? block->name : "N/A", count, msg);
 }
 EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
+
+static void edac_dev_release(struct device *dev)
+{
+	struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev);
+
+	kfree(ctx->dev.groups);
+	kfree(ctx);
+}
+
+const struct device_type edac_dev_type = {
+	.name = "edac_dev",
+	.release = edac_dev_release,
+};
+
+static void edac_dev_unreg(void *data)
+{
+	device_unregister(data);
+}
+
+/**
+ * edac_dev_register - register device for RAS features with EDAC
+ * @parent: parent device.
+ * @name: parent device's name.
+ * @private: parent driver's data to store in the context if any.
+ * @num_features: number of RAS features to register.
+ * @ras_features: list of RAS features to register.
+ *
+ * Return:
+ *  * %0       - Success.
+ *  * %-EINVAL - Invalid parameters passed.
+ *  * %-ENOMEM - Dynamic memory allocation failed.
+ *
+ */
+int edac_dev_register(struct device *parent, char *name,
+		      void *private, int num_features,
+		      const struct edac_dev_feature *ras_features)
+{
+	const struct attribute_group **ras_attr_groups;
+	struct edac_dev_feat_ctx *ctx;
+	int attr_gcnt = 0;
+	int ret, feat;
+
+	if (!parent || !name || !num_features || !ras_features)
+		return -EINVAL;
+
+	/* Double parse to make space for attributes */
+	for (feat = 0; feat < num_features; feat++) {
+		switch (ras_features[feat].ft_type) {
+		/* Add feature specific code */
+		default:
+			return -EINVAL;
+		}
+	}
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL);
+	if (!ras_attr_groups) {
+		ret = -ENOMEM;
+		goto ctx_free;
+	}
+
+	attr_gcnt = 0;
+	for (feat = 0; feat < num_features; feat++, ras_features++) {
+		switch (ras_features->ft_type) {
+		/* Add feature specific code */
+		default:
+			ret = -EINVAL;
+			goto groups_free;
+		}
+	}
+
+	ctx->dev.parent = parent;
+	ctx->dev.bus = edac_get_sysfs_subsys();
+	ctx->dev.type = &edac_dev_type;
+	ctx->dev.groups = ras_attr_groups;
+	ctx->private = private;
+	dev_set_drvdata(&ctx->dev, ctx);
+
+	ret = dev_set_name(&ctx->dev, name);
+	if (ret)
+		goto groups_free;
+
+	ret = device_register(&ctx->dev);
+	if (ret) {
+		put_device(&ctx->dev);
+		return ret;
+	}
+
+	return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev);
+
+groups_free:
+	kfree(ras_attr_groups);
+ctx_free:
+	kfree(ctx);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(edac_dev_register);
diff --git a/include/linux/edac.h b/include/linux/edac.h
index b4ee8961e623..521b17113d4d 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -661,4 +661,32 @@  static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,
 
 	return mci->dimms[index];
 }
+
+#define EDAC_FEAT_NAME_LEN	128
+
+/* RAS feature type */
+enum edac_dev_feat {
+	RAS_FEAT_MAX
+};
+
+/* EDAC device feature information structure */
+struct edac_dev_data {
+	u8 instance;
+	void *private;
+};
+
+struct edac_dev_feat_ctx {
+	struct device dev;
+	void *private;
+};
+
+struct edac_dev_feature {
+	enum edac_dev_feat ft_type;
+	u8 instance;
+	void *ctx;
+};
+
+int edac_dev_register(struct device *parent, char *dev_name,
+		      void *parent_pvt_data, int num_features,
+		      const struct edac_dev_feature *ras_features);
 #endif /* _LINUX_EDAC_H_ */