diff mbox series

[v20,01/15] EDAC: Add support for EDAC device features control

Message ID 20250212143654.1893-2-shiju.jose@huawei.com (mailing list archive)
State New
Headers show
Series EDAC: Scrub: introduce generic EDAC RAS control feature driver + CXL/ACPI-RAS2 drivers | expand

Commit Message

Shiju Jose Feb. 12, 2025, 2:36 p.m. UTC
From: Shiju Jose <shiju.jose@huawei.com>

Add generic EDAC device feature controls supporting the registration
of RAS features available in the system. The driver exposes control
attributes for these features to userspace in
/sys/bus/edac/devices/<dev-name>/<ras-feature>/

Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Daniel Ferguson <danielf@os.amperecomputing.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 Documentation/edac/features.rst |  94 +++++++++++++++++++++++++++++
 Documentation/edac/index.rst    |  10 ++++
 drivers/edac/edac_device.c      | 102 ++++++++++++++++++++++++++++++++
 include/linux/edac.h            |  26 ++++++++
 4 files changed, 232 insertions(+)
 create mode 100644 Documentation/edac/features.rst
 create mode 100644 Documentation/edac/index.rst

Comments

Fan Ni Feb. 13, 2025, 9:06 p.m. UTC | #1
On Wed, Feb 12, 2025 at 02:36:39PM +0000, shiju.jose@huawei.com wrote:
> From: Shiju Jose <shiju.jose@huawei.com>
> 
> Add generic EDAC device feature controls supporting the registration
> of RAS features available in the system. The driver exposes control
> attributes for these features to userspace in
> /sys/bus/edac/devices/<dev-name>/<ras-feature>/
> 
> Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Tested-by: Daniel Ferguson <danielf@os.amperecomputing.com>
> Signed-off-by: Shiju Jose <shiju.jose@huawei.com>

Reviewed-by: Fan Ni <fan.ni@samsung.com>
Tested-by: Fan Ni <fan.ni@samsung.com>

> ---
>  Documentation/edac/features.rst |  94 +++++++++++++++++++++++++++++
>  Documentation/edac/index.rst    |  10 ++++
>  drivers/edac/edac_device.c      | 102 ++++++++++++++++++++++++++++++++
>  include/linux/edac.h            |  26 ++++++++
>  4 files changed, 232 insertions(+)
>  create mode 100644 Documentation/edac/features.rst
>  create mode 100644 Documentation/edac/index.rst
> 
> diff --git a/Documentation/edac/features.rst b/Documentation/edac/features.rst
> new file mode 100644
> index 000000000000..6b0fdc6f5d6e
> --- /dev/null
> +++ b/Documentation/edac/features.rst
> @@ -0,0 +1,94 @@
> +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later
> +
> +============================================
> +Augmenting EDAC for controlling RAS features
> +============================================
> +
> +Copyright (c) 2024-2025 HiSilicon Limited.
> +
> +:Author:   Shiju Jose <shiju.jose@huawei.com>
> +:License:  The GNU Free Documentation License, Version 1.2 without
> +           Invariant Sections, Front-Cover Texts nor Back-Cover Texts.
> +           (dual licensed under the GPL v2)
> +
> +- Written for: 6.15
> +
> +Introduction
> +------------
> +The expansion of EDAC for controlling RAS features and exposing features
> +control attributes to userspace via sysfs. Some Examples:
> +
> +1. Scrub control
> +
> +2. Error Check Scrub (ECS) control
> +
> +3. ACPI RAS2 features
> +
> +4. Post Package Repair (PPR) control
> +
> +5. Memory Sparing Repair control etc.
> +
> +High level design is illustrated in the following diagram::
> +
> +        +-----------------------------------------------+
> +        |   Userspace - Rasdaemon                       |
> +        | +-------------+                               |
> +        | | RAS CXL mem |     +---------------+         |
> +        | |error handler|---->|               |         |
> +        | +-------------+     | RAS dynamic   |         |
> +        | +-------------+     | scrub, memory |         |
> +        | | RAS memory  |---->| repair control|         |
> +        | |error handler|     +----|----------+         |
> +        | +-------------+          |                    |
> +        +--------------------------|--------------------+
> +                                   |
> +                                   |
> +   +-------------------------------|------------------------------+
> +   |     Kernel EDAC extension for | controlling RAS Features     |
> +   |+------------------------------|----------------------------+ |
> +   || EDAC Core          Sysfs EDAC| Bus                        | |
> +   ||   +--------------------------|---------------------------+| |
> +   ||   |/sys/bus/edac/devices/<dev>/scrubX/ |   | EDAC device || |
> +   ||   |/sys/bus/edac/devices/<dev>/ecsX/   |<->| EDAC MC     || |
> +   ||   |/sys/bus/edac/devices/<dev>/repairX |   | EDAC sysfs  || |
> +   ||   +---------------------------|--------------------------+| |
> +   ||                           EDAC|Bus                        | |
> +   ||                               |                           | |
> +   ||   +----------+ Get feature    |      Get feature          | |
> +   ||   |          | desc +---------|------+ desc +----------+  | |
> +   ||   |EDAC scrub|<-----| EDAC device    |      |          |  | |
> +   ||   +----------+      | driver- RAS    |----->| EDAC mem |  | |
> +   ||   +----------+      | feature control|      | repair   |  | |
> +   ||   |          |<-----|                |      +----------+  | |
> +   ||   |EDAC ECS  |      +---------|------+                    | |
> +   ||   +----------+    Register RAS|features                   | |
> +   ||         ______________________|_____________              | |
> +   |+---------|---------------|------------------|--------------+ |
> +   |  +-------|----+  +-------|-------+     +----|----------+     |
> +   |  |            |  | CXL mem driver|     | Client driver |     |
> +   |  | ACPI RAS2  |  | scrub, ECS,   |     | memory repair |     |
> +   |  | driver     |  | sparing, PPR  |     | features      |     |
> +   |  +-----|------+  +-------|-------+     +------|--------+     |
> +   |        |                 |                    |              |
> +   +--------|-----------------|--------------------|--------------+
> +            |                 |                    |
> +   +--------|-----------------|--------------------|--------------+
> +   |    +---|-----------------|--------------------|-------+      |
> +   |    |                                                  |      |
> +   |    |            Platform HW and Firmware              |      |
> +   |    +--------------------------------------------------+      |
> +   +--------------------------------------------------------------+
> +
> +
> +1. EDAC Features components - Create feature specific descriptors.
> +   For example, EDAC scrub, EDAC ECS, EDAC memory repair in the above
> +   diagram.
> +
> +2. EDAC device driver for controlling RAS Features - Get feature's attribute
> +   descriptors from EDAC RAS feature component and registers device's RAS
> +   features with EDAC bus and exposes the features control attributes via
> +   the sysfs EDAC bus. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/
> +
> +3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for
> +   dynamic scrub/repair control to issue scrubbing/repair when excess number
> +   of corrected memory errors are reported in a short span of time.
> diff --git a/Documentation/edac/index.rst b/Documentation/edac/index.rst
> new file mode 100644
> index 000000000000..de4a3aa452cb
> --- /dev/null
> +++ b/Documentation/edac/index.rst
> @@ -0,0 +1,10 @@
> +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later
> +
> +==============
> +EDAC Subsystem
> +==============
> +
> +.. toctree::
> +   :maxdepth: 1
> +
> +   features
> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
> index 621dc2a5d034..142a661ff543 100644
> --- a/drivers/edac/edac_device.c
> +++ b/drivers/edac/edac_device.c
> @@ -570,3 +570,105 @@ void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
>  		      block ? block->name : "N/A", count, msg);
>  }
>  EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
> +
> +static void edac_dev_release(struct device *dev)
> +{
> +	struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev);
> +
> +	kfree(ctx->dev.groups);
> +	kfree(ctx);
> +}
> +
> +const struct device_type edac_dev_type = {
> +	.name = "edac_dev",
> +	.release = edac_dev_release,
> +};
> +
> +static void edac_dev_unreg(void *data)
> +{
> +	device_unregister(data);
> +}
> +
> +/**
> + * edac_dev_register - register device for RAS features with EDAC
> + * @parent: parent device.
> + * @name: name for the folder in the /sys/bus/edac/devices/,
> + *	  which is derived from the parent device.
> + *	  For eg. /sys/bus/edac/devices/cxl_mem0/
> + * @private: parent driver's data to store in the context if any.
> + * @num_features: number of RAS features to register.
> + * @ras_features: list of RAS features to register.
> + *
> + * Return:
> + *  * %0       - Success.
> + *  * %-EINVAL - Invalid parameters passed.
> + *  * %-ENOMEM - Dynamic memory allocation failed.
> + *
> + */
> +int edac_dev_register(struct device *parent, char *name,
> +		      void *private, int num_features,
> +		      const struct edac_dev_feature *ras_features)
> +{
> +	const struct attribute_group **ras_attr_groups;
> +	struct edac_dev_feat_ctx *ctx;
> +	int attr_gcnt = 0;
> +	int ret, feat;
> +
> +	if (!parent || !name || !num_features || !ras_features)
> +		return -EINVAL;
> +
> +	/* Double parse to make space for attributes */
> +	for (feat = 0; feat < num_features; feat++) {
> +		switch (ras_features[feat].ft_type) {
> +		/* Add feature specific code */
> +		default:
> +			return -EINVAL;
> +		}
> +	}
> +
> +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> +	if (!ctx)
> +		return -ENOMEM;
> +
> +	ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL);
> +	if (!ras_attr_groups) {
> +		ret = -ENOMEM;
> +		goto ctx_free;
> +	}
> +
> +	attr_gcnt = 0;
> +	for (feat = 0; feat < num_features; feat++, ras_features++) {
> +		switch (ras_features->ft_type) {
> +		/* Add feature specific code */
> +		default:
> +			ret = -EINVAL;
> +			goto groups_free;
> +		}
> +	}
> +
> +	ctx->dev.parent = parent;
> +	ctx->dev.bus = edac_get_sysfs_subsys();
> +	ctx->dev.type = &edac_dev_type;
> +	ctx->dev.groups = ras_attr_groups;
> +	ctx->private = private;
> +	dev_set_drvdata(&ctx->dev, ctx);
> +
> +	ret = dev_set_name(&ctx->dev, name);
> +	if (ret)
> +		goto groups_free;
> +
> +	ret = device_register(&ctx->dev);
> +	if (ret) {
> +		put_device(&ctx->dev);
> +		return ret;
> +	}
> +
> +	return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev);
> +
> +groups_free:
> +	kfree(ras_attr_groups);
> +ctx_free:
> +	kfree(ctx);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(edac_dev_register);
> diff --git a/include/linux/edac.h b/include/linux/edac.h
> index b4ee8961e623..8c4b6ca2a994 100644
> --- a/include/linux/edac.h
> +++ b/include/linux/edac.h
> @@ -661,4 +661,30 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,
>  
>  	return mci->dimms[index];
>  }
> +
> +/* RAS feature type */
> +enum edac_dev_feat {
> +	RAS_FEAT_MAX
> +};
> +
> +/* EDAC device feature information structure */
> +struct edac_dev_data {
> +	u8 instance;
> +	void *private;
> +};
> +
> +struct edac_dev_feat_ctx {
> +	struct device dev;
> +	void *private;
> +};
> +
> +struct edac_dev_feature {
> +	enum edac_dev_feat ft_type;
> +	u8 instance;
> +	void *ctx;
> +};
> +
> +int edac_dev_register(struct device *parent, char *dev_name,
> +		      void *parent_pvt_data, int num_features,
> +		      const struct edac_dev_feature *ras_features);
>  #endif /* _LINUX_EDAC_H_ */
> -- 
> 2.43.0
>
diff mbox series

Patch

diff --git a/Documentation/edac/features.rst b/Documentation/edac/features.rst
new file mode 100644
index 000000000000..6b0fdc6f5d6e
--- /dev/null
+++ b/Documentation/edac/features.rst
@@ -0,0 +1,94 @@ 
+.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later
+
+============================================
+Augmenting EDAC for controlling RAS features
+============================================
+
+Copyright (c) 2024-2025 HiSilicon Limited.
+
+:Author:   Shiju Jose <shiju.jose@huawei.com>
+:License:  The GNU Free Documentation License, Version 1.2 without
+           Invariant Sections, Front-Cover Texts nor Back-Cover Texts.
+           (dual licensed under the GPL v2)
+
+- Written for: 6.15
+
+Introduction
+------------
+EDAC is extended to control RAS features and to expose the features'
+control attributes to userspace via sysfs. Some examples:
+
+1. Scrub control
+
+2. Error Check Scrub (ECS) control
+
+3. ACPI RAS2 features
+
+4. Post Package Repair (PPR) control
+
+5. Memory Sparing Repair control etc.
+
+High level design is illustrated in the following diagram::
+
+        +-----------------------------------------------+
+        |   Userspace - Rasdaemon                       |
+        | +-------------+                               |
+        | | RAS CXL mem |     +---------------+         |
+        | |error handler|---->|               |         |
+        | +-------------+     | RAS dynamic   |         |
+        | +-------------+     | scrub, memory |         |
+        | | RAS memory  |---->| repair control|         |
+        | |error handler|     +----|----------+         |
+        | +-------------+          |                    |
+        +--------------------------|--------------------+
+                                   |
+                                   |
+   +-------------------------------|------------------------------+
+   |     Kernel EDAC extension for | controlling RAS Features     |
+   |+------------------------------|----------------------------+ |
+   || EDAC Core          Sysfs EDAC| Bus                        | |
+   ||   +--------------------------|---------------------------+| |
+   ||   |/sys/bus/edac/devices/<dev>/scrubX/ |   | EDAC device || |
+   ||   |/sys/bus/edac/devices/<dev>/ecsX/   |<->| EDAC MC     || |
+   ||   |/sys/bus/edac/devices/<dev>/repairX |   | EDAC sysfs  || |
+   ||   +---------------------------|--------------------------+| |
+   ||                           EDAC|Bus                        | |
+   ||                               |                           | |
+   ||   +----------+ Get feature    |      Get feature          | |
+   ||   |          | desc +---------|------+ desc +----------+  | |
+   ||   |EDAC scrub|<-----| EDAC device    |      |          |  | |
+   ||   +----------+      | driver- RAS    |----->| EDAC mem |  | |
+   ||   +----------+      | feature control|      | repair   |  | |
+   ||   |          |<-----|                |      +----------+  | |
+   ||   |EDAC ECS  |      +---------|------+                    | |
+   ||   +----------+    Register RAS|features                   | |
+   ||         ______________________|_____________              | |
+   |+---------|---------------|------------------|--------------+ |
+   |  +-------|----+  +-------|-------+     +----|----------+     |
+   |  |            |  | CXL mem driver|     | Client driver |     |
+   |  | ACPI RAS2  |  | scrub, ECS,   |     | memory repair |     |
+   |  | driver     |  | sparing, PPR  |     | features      |     |
+   |  +-----|------+  +-------|-------+     +------|--------+     |
+   |        |                 |                    |              |
+   +--------|-----------------|--------------------|--------------+
+            |                 |                    |
+   +--------|-----------------|--------------------|--------------+
+   |    +---|-----------------|--------------------|-------+      |
+   |    |                                                  |      |
+   |    |            Platform HW and Firmware              |      |
+   |    +--------------------------------------------------+      |
+   +--------------------------------------------------------------+
+
+
+1. EDAC feature components - Create feature-specific descriptors.
+   For example, EDAC scrub, EDAC ECS and EDAC memory repair in the above
+   diagram.
+
+2. EDAC device driver for controlling RAS Features - Gets each feature's
+   attribute descriptors from the EDAC RAS feature component, registers the
+   device's RAS features with the EDAC bus and exposes the feature control
+   attributes via sysfs. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/
+
+3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for
+   dynamic scrub/repair control, which issue scrubbing/repair when an excessive
+   number of corrected memory errors is reported in a short span of time.
diff --git a/Documentation/edac/index.rst b/Documentation/edac/index.rst
new file mode 100644
index 000000000000..de4a3aa452cb
--- /dev/null
+++ b/Documentation/edac/index.rst
@@ -0,0 +1,10 @@ 
+.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later
+
+==============
+EDAC Subsystem
+==============
+
+.. toctree::
+   :maxdepth: 1
+
+   features
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index 621dc2a5d034..142a661ff543 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -570,3 +570,105 @@  void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
 		      block ? block->name : "N/A", count, msg);
 }
 EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
+
+static void edac_dev_release(struct device *dev)
+{
+	/* Free the attribute group pointer array hanging off @dev first. */
+	kfree(dev->groups);
+	/* @dev is embedded in struct edac_dev_feat_ctx; free that container. */
+	kfree(container_of(dev, struct edac_dev_feat_ctx, dev));
+}
+
+const struct device_type edac_dev_type = {	/* set as ctx->dev.type in edac_dev_register() */
+	.name = "edac_dev",
+	.release = edac_dev_release,	/* frees dev.groups and the enclosing context */
+};	/* NOTE(review): non-static but not declared in edac.h in this patch - confirm linkage */
+
+static void edac_dev_unreg(void *data)	/* action handed to devm_add_action_or_reset() */
+{
+	device_unregister(data);	/* data is the &ctx->dev passed by edac_dev_register() */
+}
+
+/**
+ * edac_dev_register - register device for RAS features with EDAC
+ * @parent: parent device.
+ * @name: name for the folder in the /sys/bus/edac/devices/,
+ *	  which is derived from the parent device,
+ *	  e.g. /sys/bus/edac/devices/cxl_mem0/
+ * @private: parent driver's data to store in the context if any.
+ * @num_features: number of RAS features to register, must be positive.
+ * @ras_features: list of RAS features to register.
+ *
+ * Return:
+ *  * %0       - Success.
+ *  * %-EINVAL - Invalid parameters passed.
+ *  * %-ENOMEM - Dynamic memory allocation failed.
+ *  * Other negative errno - registering or devm-tracking the device failed.
+ */
+int edac_dev_register(struct device *parent, char *name,
+		      void *private, int num_features,
+		      const struct edac_dev_feature *ras_features)
+{
+	const struct attribute_group **ras_attr_groups;
+	struct edac_dev_feat_ctx *ctx;
+	int attr_gcnt = 0, ret, feat;
+
+	if (!parent || !name || num_features <= 0 || !ras_features)
+		return -EINVAL;
+
+	/* Pass 1 of 2: validate feature types up front; attr_gcnt sizes the array below */
+	for (feat = 0; feat < num_features; feat++) {
+		switch (ras_features[feat].ft_type) {
+		/* Add feature specific code */
+		default:
+			return -EINVAL;
+		}
+	}
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL);
+	if (!ras_attr_groups) {
+		ret = -ENOMEM;
+		goto ctx_free;
+	}
+
+	attr_gcnt = 0;	/* reset before the second pass */
+	for (feat = 0; feat < num_features; feat++) {
+		switch (ras_features[feat].ft_type) {
+		/* Add feature specific code */
+		default:
+			ret = -EINVAL;
+			goto groups_free;
+		}
+	}
+
+	ctx->dev.parent = parent;
+	ctx->dev.bus = edac_get_sysfs_subsys();
+	ctx->dev.type = &edac_dev_type;
+	ctx->dev.groups = ras_attr_groups;
+	ctx->private = private;
+	dev_set_drvdata(&ctx->dev, ctx);
+
+	/* @name is caller data: pass it via "%s" so it is never parsed as a format */
+	ret = dev_set_name(&ctx->dev, "%s", name);
+	if (ret)
+		goto groups_free;
+
+	ret = device_register(&ctx->dev);
+	if (ret) {
+		put_device(&ctx->dev);	/* release callback frees groups and ctx */
+		return ret;
+	}
+
+	return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev);
+
+groups_free:
+	kfree(ras_attr_groups);
+ctx_free:
+	kfree(ctx);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(edac_dev_register);
diff --git a/include/linux/edac.h b/include/linux/edac.h
index b4ee8961e623..8c4b6ca2a994 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -661,4 +661,30 @@  static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,
 
 	return mci->dimms[index];
 }
+
+/* RAS feature type; edac_dev_register() switches on this via edac_dev_feature.ft_type */
+enum edac_dev_feat {
+	RAS_FEAT_MAX	/* only enumerator so far; presumably the type-count sentinel - confirm */
+};
+
+/* EDAC device feature information structure (not referenced by the code in this patch) */
+struct edac_dev_data {
+	u8 instance;	/* NOTE(review): presumably the feature instance index - confirm */
+	void *private;	/* NOTE(review): presumably feature-private data - confirm */
+};
+
+struct edac_dev_feat_ctx {	/* allocated per edac_dev_register() call */
+	struct device dev;	/* device registered on the EDAC bus; its release frees this struct */
+	void *private;		/* caller's @private argument, stored as given */
+};
+
+struct edac_dev_feature {	/* one entry of the @ras_features array */
+	enum edac_dev_feat ft_type;	/* selects the feature case in edac_dev_register() */
+	u8 instance;	/* NOTE(review): presumably the per-type instance number - confirm */
+	void *ctx;	/* NOTE(review): presumably the feature driver's context - confirm */
+};
+
+int edac_dev_register(struct device *parent, char *name,
+		      void *private, int num_features,
+		      const struct edac_dev_feature *ras_features); /* kernel-doc: edac_device.c */
 #endif /* _LINUX_EDAC_H_ */