Message ID | 20240419164720.1765-2-shiju.jose@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ras: scrub: introduce subsystem + CXL/ACPI-RAS2 drivers | expand |
On Sat, Apr 20, 2024 at 12:47:10AM +0800, shiju.jose@huawei.com wrote: > From: Shiju Jose <shiju.jose@huawei.com> > > Add scrub subsystem supports configuring the memory scrubbers > in the system. The scrub subsystem provides the interface for > registering the scrub devices. The scrub control attributes > are provided to the user in /sys/class/ras/rasX/scrub > > Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Signed-off-by: Shiju Jose <shiju.jose@huawei.com> > --- > .../ABI/testing/sysfs-class-scrub-configure | 47 +++ > drivers/ras/Kconfig | 7 + > drivers/ras/Makefile | 1 + > drivers/ras/memory_scrub.c | 271 ++++++++++++++++++ > include/linux/memory_scrub.h | 37 +++ > 5 files changed, 363 insertions(+) > create mode 100644 Documentation/ABI/testing/sysfs-class-scrub-configure > create mode 100755 drivers/ras/memory_scrub.c > create mode 100755 include/linux/memory_scrub.h > > diff --git a/Documentation/ABI/testing/sysfs-class-scrub-configure b/Documentation/ABI/testing/sysfs-class-scrub-configure > new file mode 100644 > index 000000000000..3ed77dbb00ad > --- /dev/null > +++ b/Documentation/ABI/testing/sysfs-class-scrub-configure > @@ -0,0 +1,47 @@ > +What: /sys/class/ras/ > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + The ras/ class subdirectory belongs to the > + common ras features such as scrub subsystem. > + > +What: /sys/class/ras/rasX/scrub/ > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + The /sys/class/ras/ras{0,1,2,3,...}/scrub directories > + correspond to each scrub device registered with the > + scrub subsystem. > + > +What: /sys/class/ras/rasX/scrub/name > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RO) name of the memory scrubber > + > +What: /sys/class/ras/rasX/scrub/enable_background > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RW) Enable/Disable background(patrol) scrubbing if supported. > + > +What: /sys/class/ras/rasX/scrub/rate_available > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RO) Supported range for the scrub rate by the scrubber. > + The scrub rate represents in hours. > + > +What: /sys/class/ras/rasX/scrub/rate > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RW) The scrub rate specified and it must be with in the > + supported range by the scrubber. > + The scrub rate represents in hours. > diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig > index fc4f4bb94a4c..181701479564 100644 > --- a/drivers/ras/Kconfig > +++ b/drivers/ras/Kconfig > @@ -46,4 +46,11 @@ config RAS_FMPM > Memory will be retired during boot time and run time depending on > platform-specific policies. > > +config SCRUB > + tristate "Memory scrub driver" > + help > + This option selects the memory scrub subsystem, supports > + configuring the parameters of underlying scrubbers in the > + system for the DRAM memories. > + > endif > diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile > index 11f95d59d397..89bcf0d84355 100644 > --- a/drivers/ras/Makefile > +++ b/drivers/ras/Makefile > @@ -2,6 +2,7 @@ > obj-$(CONFIG_RAS) += ras.o > obj-$(CONFIG_DEBUG_FS) += debugfs.o > obj-$(CONFIG_RAS_CEC) += cec.o > +obj-$(CONFIG_SCRUB) += memory_scrub.o > > obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o > obj-y += amd/atl/ > diff --git a/drivers/ras/memory_scrub.c b/drivers/ras/memory_scrub.c > new file mode 100755 > index 000000000000..7e995380ec3a > --- /dev/null > +++ b/drivers/ras/memory_scrub.c > @@ -0,0 +1,271 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Memory scrub subsystem supports configuring the registered > + * memory scrubbers. > + * > + * Copyright (c) 2024 HiSilicon Limited. > + */ > + > +#define pr_fmt(fmt) "MEM SCRUB: " fmt > + > +#include <linux/acpi.h> > +#include <linux/bitops.h> > +#include <linux/delay.h> > +#include <linux/kfifo.h> > +#include <linux/memory_scrub.h> > +#include <linux/platform_device.h> > +#include <linux/spinlock.h> > + > +/* memory scrubber config definitions */ > +#define SCRUB_ID_PREFIX "ras" > +#define SCRUB_ID_FORMAT SCRUB_ID_PREFIX "%d" > + > +static DEFINE_IDA(scrub_ida); > + > +struct scrub_device { > + int id; > + struct device dev; > + const struct scrub_ops *ops; > +}; > + > +#define to_scrub_device(d) container_of(d, struct scrub_device, dev) > +static ssize_t enable_background_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t len) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + bool enable; > + int ret; > + > + ret = kstrtobool(buf, &enable); > + if (ret < 0) > + return ret; > + > + ret = scrub_dev->ops->set_enabled_bg(dev, enable); > + if (ret) > + return ret; > + > + return len; > +} > + > +static ssize_t enable_background_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + bool enable; > + int ret; > + > + ret = scrub_dev->ops->get_enabled_bg(dev, &enable); > + if (ret) > + return ret; > + > + return sysfs_emit(buf, "%d\n", enable); > +} > + > +static ssize_t name_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + int ret; > + > + ret = scrub_dev->ops->get_name(dev, buf); > + if (ret) > + return ret; > + > + return strlen(buf); > +} > + > +static ssize_t rate_show(struct device *dev, struct device_attribute *attr, > + char *buf) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + u64 val; > + int ret; > + > + ret = scrub_dev->ops->rate_read(dev, &val); > + if (ret) > + return ret; > + > + return sysfs_emit(buf, "0x%llx\n", val); > +} > + > +static ssize_t rate_store(struct device *dev, struct device_attribute *attr, > + const char *buf, size_t len) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + long val; > + int ret; > + > + ret = kstrtol(buf, 10, &val); > + if (ret < 0) > + return ret; > + > + ret = scrub_dev->ops->rate_write(dev, val); > + if (ret) > + return ret; > + > + return len; > +} > + > +static ssize_t rate_available_show(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + u64 min_sr, max_sr; > + int ret; > + > + ret = scrub_dev->ops->rate_avail_range(dev, &min_sr, &max_sr); > + if (ret) > + return ret; > + > + return sysfs_emit(buf, "0x%llx-0x%llx\n", min_sr, max_sr); > +} > + > +DEVICE_ATTR_RW(enable_background); > +DEVICE_ATTR_RO(name); > +DEVICE_ATTR_RW(rate); > +DEVICE_ATTR_RO(rate_available); > + > +static struct attribute *scrub_attrs[] = { > + &dev_attr_enable_background.attr, > + &dev_attr_name.attr, > + &dev_attr_rate.attr, > + &dev_attr_rate_available.attr, > + NULL > +}; > + > +static umode_t scrub_attr_visible(struct kobject *kobj, > + struct attribute *a, int attr_id) > +{ > + struct device *dev = kobj_to_dev(kobj); > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + const struct scrub_ops *ops = scrub_dev->ops; > + > + if (a == &dev_attr_enable_background.attr) { > + if (ops->set_enabled_bg && ops->get_enabled_bg) > + return a->mode; > + if (ops->get_enabled_bg) > + return 0444; > + return 0; > + } > + if (a == &dev_attr_name.attr) > + return ops->get_name ? a->mode : 0; > + if (a == &dev_attr_rate_available.attr) > + return ops->rate_avail_range ? a->mode : 0; > + if (a == &dev_attr_rate.attr) { /* Write only makes little sense */ > + if (ops->rate_read && ops->rate_write) > + return a->mode; > + if (ops->rate_read) > + return 0444; > + return 0; > + } > + > + return 0; > +} > + > +static const struct attribute_group scrub_attr_group = { > + .name = "scrub", > + .attrs = scrub_attrs, > + .is_visible = scrub_attr_visible, > +}; > + > +static const struct attribute_group *scrub_attr_groups[] = { > + &scrub_attr_group, > + NULL > +}; > + > +static void scrub_dev_release(struct device *dev) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + > + ida_free(&scrub_ida, scrub_dev->id); > + kfree(scrub_dev); > +} > + > +static struct class scrub_class = { > + .name = "ras", > + .dev_groups = scrub_attr_groups, > + .dev_release = scrub_dev_release, > +}; > + > +static struct device * > +scrub_device_register(struct device *parent, void *drvdata, > + const struct scrub_ops *ops) > +{ > + struct scrub_device *scrub_dev; > + struct device *hdev; > + int err; > + > + scrub_dev = kzalloc(sizeof(*scrub_dev), GFP_KERNEL); > + if (!scrub_dev) > + return ERR_PTR(-ENOMEM); > + hdev = &scrub_dev->dev; > + > + scrub_dev->id = ida_alloc(&scrub_ida, GFP_KERNEL); > + if (scrub_dev->id < 0) { > + kfree(scrub_dev); > + return ERR_PTR(-ENOMEM); > + } > + > + scrub_dev->ops = ops; > + hdev->class = &scrub_class; > + hdev->parent = parent; > + dev_set_drvdata(hdev, drvdata); > + dev_set_name(hdev, SCRUB_ID_FORMAT, scrub_dev->id); Need to check the return value of dev_set_name? fan > + err = device_register(hdev); > + if (err) { > + put_device(hdev); > + return ERR_PTR(err); > + } > + > + return hdev; > +} > + > +static void devm_scrub_release(void *dev) > +{ > + device_unregister(dev); > +} > + > +/** > + * devm_scrub_device_register - register scrubber device > + * @dev: the parent device > + * @drvdata: driver data to attach to the scrub device > + * @ops: pointer to scrub_ops structure (optional) > + * > + * Returns the pointer to the new device on success, ERR_PTR() otherwise. > + * The new device would be automatically unregistered with the parent device. > + */ > +struct device * > +devm_scrub_device_register(struct device *dev, void *drvdata, > + const struct scrub_ops *ops) > +{ > + struct device *hdev; > + int ret; > + > + if (!dev) > + return ERR_PTR(-EINVAL); > + > + hdev = scrub_device_register(dev, drvdata, ops); > + if (IS_ERR(hdev)) > + return hdev; > + > + ret = devm_add_action_or_reset(dev, devm_scrub_release, hdev); > + if (ret) > + return ERR_PTR(ret); > + > + return hdev; > +} > +EXPORT_SYMBOL_GPL(devm_scrub_device_register); > + > +static int __init memory_scrub_control_init(void) > +{ > + return class_register(&scrub_class); > +} > +subsys_initcall(memory_scrub_control_init); > + > +static void memory_scrub_control_exit(void) > +{ > + class_unregister(&scrub_class); > +} > +module_exit(memory_scrub_control_exit); > diff --git a/include/linux/memory_scrub.h b/include/linux/memory_scrub.h > new file mode 100755 > index 000000000000..f0e1657a5072 > --- /dev/null > +++ b/include/linux/memory_scrub.h > @@ -0,0 +1,37 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Memory scrub subsystem driver supports controlling > + * the memory scrubbers in the system. > + * > + * Copyright (c) 2024 HiSilicon Limited. > + */ > + > +#ifndef __MEMORY_SCRUB_H > +#define __MEMORY_SCRUB_H > + > +#include <linux/types.h> > + > +struct device; > + > +/** > + * struct scrub_ops - scrub device operations (all elements optional) > + * @get_enabled_bg: check if currently performing background scrub. > + * @set_enabled_bg: start or stop a bg-scrub. > + * @get_name: get the memory scrubber name. > + * @rate_avail_range: retrieve limits on supported rates. > + * @rate_read: read the scrub rate > + * @rate_write: set the scrub rate > + */ > +struct scrub_ops { > + int (*get_enabled_bg)(struct device *dev, bool *enable); > + int (*set_enabled_bg)(struct device *dev, bool enable); > + int (*get_name)(struct device *dev, char *buf); > + int (*rate_avail_range)(struct device *dev, u64 *min, u64 *max); > + int (*rate_read)(struct device *dev, u64 *rate); > + int (*rate_write)(struct device *dev, u64 rate); > +}; > + > +struct device * > +devm_scrub_device_register(struct device *dev, void *drvdata, > + const struct scrub_ops *ops); > +#endif /* __MEMORY_SCRUB_H */ > -- > 2.34.1 >
On Sat, Apr 20, 2024 at 12:47:10AM +0800, shiju.jose@huawei.com wrote: > From: Shiju Jose <shiju.jose@huawei.com> > > Add scrub subsystem supports configuring the memory scrubbers > in the system. The scrub subsystem provides the interface for > registering the scrub devices. The scrub control attributes > are provided to the user in /sys/class/ras/rasX/scrub > > Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Signed-off-by: Shiju Jose <shiju.jose@huawei.com> > --- > .../ABI/testing/sysfs-class-scrub-configure | 47 +++ > drivers/ras/Kconfig | 7 + > drivers/ras/Makefile | 1 + > drivers/ras/memory_scrub.c | 271 ++++++++++++++++++ > include/linux/memory_scrub.h | 37 +++ > 5 files changed, 363 insertions(+) > create mode 100644 Documentation/ABI/testing/sysfs-class-scrub-configure > create mode 100755 drivers/ras/memory_scrub.c > create mode 100755 include/linux/memory_scrub.h ERROR: modpost: missing MODULE_LICENSE() in drivers/ras/memory_scrub.o make[2]: *** [scripts/Makefile.modpost:145: Module.symvers] Error 1 make[1]: *** [/mnt/kernel/kernel/2nd/linux/Makefile:1871: modpost] Error 2 make: *** [Makefile:240: __sub-make] Error 2 Each patch of yours needs to build. > diff --git a/Documentation/ABI/testing/sysfs-class-scrub-configure b/Documentation/ABI/testing/sysfs-class-scrub-configure > new file mode 100644 > index 000000000000..3ed77dbb00ad > --- /dev/null > +++ b/Documentation/ABI/testing/sysfs-class-scrub-configure > @@ -0,0 +1,47 @@ > +What: /sys/class/ras/ > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + The ras/ class subdirectory belongs to the > + common ras features such as scrub subsystem. > + > +What: /sys/class/ras/rasX/scrub/ > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + The /sys/class/ras/ras{0,1,2,3,...}/scrub directories You have different scrubbers. I'd prefer if you put their names in here instead and do this structure: /sys/class/ras/scrub/cxl-patrol /ars /cxl-ecs /acpi-ras2 and so on. Unless the idea is for those devices to have multiple RAS-specific functionality than just scrubbing. Then you want to do /sys/class/ras/cxl/scrub /other_function /sys/class/ras/ars/scrub /... You get the idea. > + correspond to each scrub device registered with the > + scrub subsystem. > + > +What: /sys/class/ras/rasX/scrub/name > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RO) name of the memory scrubber > + > +What: /sys/class/ras/rasX/scrub/enable_background > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RW) Enable/Disable background(patrol) scrubbing if supported. > + > +What: /sys/class/ras/rasX/scrub/rate_available That's dumping a range so I guess it should be called probably "possible_rates" or so, so that it is clear what it means. If some scrubbers support only a discrete set of rate values, then "possible_rates" fits too if you dump them as a list of values. > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RO) Supported range for the scrub rate by the scrubber. > + The scrub rate represents in hours. > + > +What: /sys/class/ras/rasX/scrub/rate > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RW) The scrub rate specified and it must be with in the > + supported range by the scrubber. > + The scrub rate represents in hours. > diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig > index fc4f4bb94a4c..181701479564 100644 > --- a/drivers/ras/Kconfig > +++ b/drivers/ras/Kconfig > @@ -46,4 +46,11 @@ config RAS_FMPM > Memory will be retired during boot time and run time depending on > platform-specific policies. > > +config SCRUB > + tristate "Memory scrub driver" > + help > + This option selects the memory scrub subsystem, supports s/This option selects/Enable/ > + configuring the parameters of underlying scrubbers in the > + system for the DRAM memories. > + > endif > diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile > index 11f95d59d397..89bcf0d84355 100644 > --- a/drivers/ras/Makefile > +++ b/drivers/ras/Makefile > @@ -2,6 +2,7 @@ > obj-$(CONFIG_RAS) += ras.o > obj-$(CONFIG_DEBUG_FS) += debugfs.o > obj-$(CONFIG_RAS_CEC) += cec.o > +obj-$(CONFIG_SCRUB) += memory_scrub.o > > obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o > obj-y += amd/atl/ > diff --git a/drivers/ras/memory_scrub.c b/drivers/ras/memory_scrub.c > new file mode 100755 > index 000000000000..7e995380ec3a > --- /dev/null > +++ b/drivers/ras/memory_scrub.c > @@ -0,0 +1,271 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Memory scrub subsystem supports configuring the registered > + * memory scrubbers. > + * > + * Copyright (c) 2024 HiSilicon Limited. > + */ > + > +#define pr_fmt(fmt) "MEM SCRUB: " fmt > + > +#include <linux/acpi.h> > +#include <linux/bitops.h> > +#include <linux/delay.h> > +#include <linux/kfifo.h> > +#include <linux/memory_scrub.h> > +#include <linux/platform_device.h> > +#include <linux/spinlock.h> > + > +/* memory scrubber config definitions */ No need for that comment. > +static ssize_t rate_available_show(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + u64 min_sr, max_sr; > + int ret; > + > + ret = scrub_dev->ops->rate_avail_range(dev, &min_sr, &max_sr); > + if (ret) > + return ret; > + > + return sysfs_emit(buf, "0x%llx-0x%llx\n", min_sr, max_sr); > +} This glue driver will need to store the min and max scrub rates on init and rate_store() will have to verify the newly supplied rate is within that range before writing it. Not the user, nor the underlying hw driver. > + > +DEVICE_ATTR_RW(enable_background); > +DEVICE_ATTR_RO(name); > +DEVICE_ATTR_RW(rate); > +DEVICE_ATTR_RO(rate_available); static > + > +static struct attribute *scrub_attrs[] = { > + &dev_attr_enable_background.attr, > + &dev_attr_name.attr, > + &dev_attr_rate.attr, > + &dev_attr_rate_available.attr, > + NULL > +}; > + > +static umode_t scrub_attr_visible(struct kobject *kobj, > + struct attribute *a, int attr_id) > +{ > + struct device *dev = kobj_to_dev(kobj); > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + const struct scrub_ops *ops = scrub_dev->ops; > + > + if (a == &dev_attr_enable_background.attr) { > + if (ops->set_enabled_bg && ops->get_enabled_bg) > + return a->mode; > + if (ops->get_enabled_bg) > + return 0444; > + return 0; > + } > + if (a == &dev_attr_name.attr) > + return ops->get_name ? a->mode : 0; > + if (a == &dev_attr_rate_available.attr) > + return ops->rate_avail_range ? a->mode : 0; > + if (a == &dev_attr_rate.attr) { /* Write only makes little sense */ > + if (ops->rate_read && ops->rate_write) > + return a->mode; > + if (ops->rate_read) > + return 0444; > + return 0; > + } All of that stuff's permissions should be root-only. > + > + return 0; > +} > + > +static const struct attribute_group scrub_attr_group = { > + .name = "scrub", > + .attrs = scrub_attrs, > + .is_visible = scrub_attr_visible, > +}; > + > +static const struct attribute_group *scrub_attr_groups[] = { > + &scrub_attr_group, > + NULL > +}; > + > +static void scrub_dev_release(struct device *dev) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + > + ida_free(&scrub_ida, scrub_dev->id); > + kfree(scrub_dev); > +} > + > +static struct class scrub_class = { > + .name = "ras", > + .dev_groups = scrub_attr_groups, > + .dev_release = scrub_dev_release, > +}; > + > +static struct device * > +scrub_device_register(struct device *parent, void *drvdata, > + const struct scrub_ops *ops) > +{ > + struct scrub_device *scrub_dev; > + struct device *hdev; > + int err; > + > + scrub_dev = kzalloc(sizeof(*scrub_dev), GFP_KERNEL); > + if (!scrub_dev) > + return ERR_PTR(-ENOMEM); > + hdev = &scrub_dev->dev; > + > + scrub_dev->id = ida_alloc(&scrub_ida, GFP_KERNEL); What's that silly thing for? > + if (scrub_dev->id < 0) { > + kfree(scrub_dev); > + return ERR_PTR(-ENOMEM); > + } > + > + scrub_dev->ops = ops; > + hdev->class = &scrub_class; > + hdev->parent = parent; > + dev_set_drvdata(hdev, drvdata); > + dev_set_name(hdev, SCRUB_ID_FORMAT, scrub_dev->id); > + err = device_register(hdev); > + if (err) { > + put_device(hdev); > + return ERR_PTR(err); > + } > + > + return hdev; > +} > + > +static void devm_scrub_release(void *dev) > +{ > + device_unregister(dev); > +} > + > +/** > + * devm_scrub_device_register - register scrubber device > + * @dev: the parent device > + * @drvdata: driver data to attach to the scrub device > + * @ops: pointer to scrub_ops structure (optional) > + * > + * Returns the pointer to the new device on success, ERR_PTR() otherwise. > + * The new device would be automatically unregistered with the parent device. > + */ > +struct device * > +devm_scrub_device_register(struct device *dev, void *drvdata, > + const struct scrub_ops *ops) > +{ > + struct device *hdev; > + int ret; > + > + if (!dev) > + return ERR_PTR(-EINVAL); > + > + hdev = scrub_device_register(dev, drvdata, ops); > + if (IS_ERR(hdev)) > + return hdev; > + > + ret = devm_add_action_or_reset(dev, devm_scrub_release, hdev); > + if (ret) > + return ERR_PTR(ret); > + > + return hdev; > +} > +EXPORT_SYMBOL_GPL(devm_scrub_device_register); > + > +static int __init memory_scrub_control_init(void) > +{ > + return class_register(&scrub_class); > +} > +subsys_initcall(memory_scrub_control_init); You can't just blindly register this thing without checking whether there are even any hw scrubber devices on the system.
>-----Original Message----- >From: fan <nifan.cxl@gmail.com> >Sent: 24 April 2024 21:26 >To: Shiju Jose <shiju.jose@huawei.com> >Cc: linux-cxl@vger.kernel.org; linux-acpi@vger.kernel.org; linux- >mm@kvack.org; dan.j.williams@intel.com; dave@stgolabs.net; Jonathan >Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com; >alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com; >linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; david@redhat.com; >Vilas.Sridharan@amd.com; leo.duran@amd.com; Yazen.Ghannam@amd.com; >rientjes@google.com; jiaqiyan@google.com; tony.luck@intel.com; >Jon.Grimm@amd.com; dave.hansen@linux.intel.com; rafael@kernel.org; >lenb@kernel.org; naoya.horiguchi@nec.com; james.morse@arm.com; >jthoughton@google.com; somasundaram.a@hpe.com; >erdemaktas@google.com; pgonda@google.com; duenwen@google.com; >mike.malvestuto@intel.com; gthelen@google.com; >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; >kangkang.shen@futurewei.com; wanghuiqiang <wanghuiqiang@huawei.com>; >Linuxarm <linuxarm@huawei.com> >Subject: Re: [RFC PATCH v8 01/10] ras: scrub: Add scrub subsystem > >On Sat, Apr 20, 2024 at 12:47:10AM +0800, shiju.jose@huawei.com wrote: >> From: Shiju Jose <shiju.jose@huawei.com> >> >> Add scrub subsystem supports configuring the memory scrubbers in the >> system. The scrub subsystem provides the interface for registering the >> scrub devices. The scrub control attributes are provided to the user >> in /sys/class/ras/rasX/scrub >> >> Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> >> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> >> Signed-off-by: Shiju Jose <shiju.jose@huawei.com> >> --- >> .../ABI/testing/sysfs-class-scrub-configure | 47 +++ >> drivers/ras/Kconfig | 7 + >> drivers/ras/Makefile | 1 + >> drivers/ras/memory_scrub.c | 271 ++++++++++++++++++ >> include/linux/memory_scrub.h | 37 +++ >> 5 files changed, 363 insertions(+) >> create mode 100644 >> Documentation/ABI/testing/sysfs-class-scrub-configure >> create mode 100755 drivers/ras/memory_scrub.c create mode 100755 >> include/linux/memory_scrub.h >> >> diff --git a/Documentation/ABI/testing/sysfs-class-scrub-configure >> b/Documentation/ABI/testing/sysfs-class-scrub-configure >> new file mode 100644 >> index 000000000000..3ed77dbb00ad >> --- /dev/null >> +++ b/Documentation/ABI/testing/sysfs-class-scrub-configure >> @@ -0,0 +1,47 @@ >> +What: /sys/class/ras/ >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + The ras/ class subdirectory belongs to the >> + common ras features such as scrub subsystem. >> + >> +What: /sys/class/ras/rasX/scrub/ >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + The /sys/class/ras/ras{0,1,2,3,...}/scrub directories >> + correspond to each scrub device registered with the >> + scrub subsystem. >> + >> +What: /sys/class/ras/rasX/scrub/name >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + (RO) name of the memory scrubber >> + >> +What: /sys/class/ras/rasX/scrub/enable_background >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + (RW) Enable/Disable background(patrol) scrubbing if supported. >> + >> +What: /sys/class/ras/rasX/scrub/rate_available >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + (RO) Supported range for the scrub rate by the scrubber. >> + The scrub rate represents in hours. >> + >> +What: /sys/class/ras/rasX/scrub/rate >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + (RW) The scrub rate specified and it must be with in the >> + supported range by the scrubber. >> + The scrub rate represents in hours. >> diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index >> fc4f4bb94a4c..181701479564 100644 >> --- a/drivers/ras/Kconfig >> +++ b/drivers/ras/Kconfig >> @@ -46,4 +46,11 @@ config RAS_FMPM >> Memory will be retired during boot time and run time depending on >> platform-specific policies. >> >> +config SCRUB >> + tristate "Memory scrub driver" >> + help >> + This option selects the memory scrub subsystem, supports >> + configuring the parameters of underlying scrubbers in the >> + system for the DRAM memories. >> + >> endif >> diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index >> 11f95d59d397..89bcf0d84355 100644 >> --- a/drivers/ras/Makefile >> +++ b/drivers/ras/Makefile >> @@ -2,6 +2,7 @@ >> obj-$(CONFIG_RAS) += ras.o >> obj-$(CONFIG_DEBUG_FS) += debugfs.o >> obj-$(CONFIG_RAS_CEC) += cec.o >> +obj-$(CONFIG_SCRUB) += memory_scrub.o >> >> obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o >> obj-y += amd/atl/ >> diff --git a/drivers/ras/memory_scrub.c b/drivers/ras/memory_scrub.c >> new file mode 100755 index 000000000000..7e995380ec3a >> --- /dev/null >> +++ b/drivers/ras/memory_scrub.c >> @@ -0,0 +1,271 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> +/* >> + * Memory scrub subsystem supports configuring the registered >> + * memory scrubbers. >> + * >> + * Copyright (c) 2024 HiSilicon Limited. >> + */ >> + >> +#define pr_fmt(fmt) "MEM SCRUB: " fmt >> + >> +#include <linux/acpi.h> >> +#include <linux/bitops.h> >> +#include <linux/delay.h> >> +#include <linux/kfifo.h> >> +#include <linux/memory_scrub.h> >> +#include <linux/platform_device.h> >> +#include <linux/spinlock.h> >> + >> +/* memory scrubber config definitions */ #define SCRUB_ID_PREFIX >> +"ras" >> +#define SCRUB_ID_FORMAT SCRUB_ID_PREFIX "%d" >> + >> +static DEFINE_IDA(scrub_ida); >> + >> +struct scrub_device { >> + int id; >> + struct device dev; >> + const struct scrub_ops *ops; >> +}; >> + >> +#define to_scrub_device(d) container_of(d, struct scrub_device, dev) >> +static ssize_t enable_background_store(struct device *dev, >> + struct device_attribute *attr, >> + const char *buf, size_t len) { >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + bool enable; >> + int ret; >> + >> + ret = kstrtobool(buf, &enable); >> + if (ret < 0) >> + return ret; >> + >> + ret = scrub_dev->ops->set_enabled_bg(dev, enable); >> + if (ret) >> + return ret; >> + >> + return len; >> +} >> + >> +static ssize_t enable_background_show(struct device *dev, >> + struct device_attribute *attr, char *buf) { >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + bool enable; >> + int ret; >> + >> + ret = scrub_dev->ops->get_enabled_bg(dev, &enable); >> + if (ret) >> + return ret; >> + >> + return sysfs_emit(buf, "%d\n", enable); } >> + >> +static ssize_t name_show(struct device *dev, >> + struct device_attribute *attr, char *buf) { >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + int ret; >> + >> + ret = scrub_dev->ops->get_name(dev, buf); >> + if (ret) >> + return ret; >> + >> + return strlen(buf); >> +} >> + >> +static ssize_t rate_show(struct device *dev, struct device_attribute *attr, >> + char *buf) >> +{ >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + u64 val; >> + int ret; >> + >> + ret = scrub_dev->ops->rate_read(dev, &val); >> + if (ret) >> + return ret; >> + >> + return sysfs_emit(buf, "0x%llx\n", val); } >> + >> +static ssize_t rate_store(struct device *dev, struct device_attribute *attr, >> + const char *buf, size_t len) >> +{ >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + long val; >> + int ret; >> + >> + ret = kstrtol(buf, 10, &val); >> + if (ret < 0) >> + return ret; >> + >> + ret = scrub_dev->ops->rate_write(dev, val); >> + if (ret) >> + return ret; >> + >> + return len; >> +} >> + >> +static ssize_t rate_available_show(struct device *dev, >> + struct device_attribute *attr, >> + char *buf) >> +{ >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + u64 min_sr, max_sr; >> + int ret; >> + >> + ret = scrub_dev->ops->rate_avail_range(dev, &min_sr, &max_sr); >> + if (ret) >> + return ret; >> + >> + return sysfs_emit(buf, "0x%llx-0x%llx\n", min_sr, max_sr); } >> + >> +DEVICE_ATTR_RW(enable_background); >> +DEVICE_ATTR_RO(name); >> +DEVICE_ATTR_RW(rate); >> +DEVICE_ATTR_RO(rate_available); >> + >> +static struct attribute *scrub_attrs[] = { >> + &dev_attr_enable_background.attr, >> + &dev_attr_name.attr, >> + &dev_attr_rate.attr, >> + &dev_attr_rate_available.attr, >> + NULL >> +}; >> + >> +static umode_t scrub_attr_visible(struct kobject *kobj, >> + struct attribute *a, int attr_id) { >> + struct device *dev = kobj_to_dev(kobj); >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + const struct scrub_ops *ops = scrub_dev->ops; >> + >> + if (a == &dev_attr_enable_background.attr) { >> + if (ops->set_enabled_bg && ops->get_enabled_bg) >> + return a->mode; >> + if (ops->get_enabled_bg) >> + return 0444; >> + return 0; >> + } >> + if (a == &dev_attr_name.attr) >> + return ops->get_name ? a->mode : 0; >> + if (a == &dev_attr_rate_available.attr) >> + return ops->rate_avail_range ? a->mode : 0; >> + if (a == &dev_attr_rate.attr) { /* Write only makes little sense */ >> + if (ops->rate_read && ops->rate_write) >> + return a->mode; >> + if (ops->rate_read) >> + return 0444; >> + return 0; >> + } >> + >> + return 0; >> +} >> + >> +static const struct attribute_group scrub_attr_group = { >> + .name = "scrub", >> + .attrs = scrub_attrs, >> + .is_visible = scrub_attr_visible, >> +}; >> + >> +static const struct attribute_group *scrub_attr_groups[] = { >> + &scrub_attr_group, >> + NULL >> +}; >> + >> +static void scrub_dev_release(struct device *dev) { >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + >> + ida_free(&scrub_ida, scrub_dev->id); >> + kfree(scrub_dev); >> +} >> + >> +static struct class scrub_class = { >> + .name = "ras", >> + .dev_groups = scrub_attr_groups, >> + .dev_release = scrub_dev_release, >> +}; >> + >> +static struct device * >> +scrub_device_register(struct device *parent, void *drvdata, >> + const struct scrub_ops *ops) >> +{ >> + struct scrub_device *scrub_dev; >> + struct device *hdev; >> + int err; >> + >> + scrub_dev = kzalloc(sizeof(*scrub_dev), GFP_KERNEL); >> + if (!scrub_dev) >> + return ERR_PTR(-ENOMEM); >> + hdev = &scrub_dev->dev; >> + >> + scrub_dev->id = ida_alloc(&scrub_ida, GFP_KERNEL); >> + if (scrub_dev->id < 0) { >> + kfree(scrub_dev); >> + return ERR_PTR(-ENOMEM); >> + } >> + >> + scrub_dev->ops = ops; >> + hdev->class = &scrub_class; >> + hdev->parent = parent; >> + dev_set_drvdata(hdev, drvdata); >> + dev_set_name(hdev, SCRUB_ID_FORMAT, scrub_dev->id); > >Need to check the return value of dev_set_name? Will do, though checking return value of dev_set_name() is not common in the kernel. > >fan > >> + err = device_register(hdev); >> + if (err) { Thanks, Shiju
Hi Boris, Thanks for the feedbacks. Please find reply inline, Thanks, Shiju >-----Original Message----- >From: Borislav Petkov <bp@alien8.de> >Sent: 25 April 2024 11:16 >To: Shiju Jose <shiju.jose@huawei.com> >Cc: linux-cxl@vger.kernel.org; linux-acpi@vger.kernel.org; linux- >mm@kvack.org; dan.j.williams@intel.com; dave@stgolabs.net; Jonathan >Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com; >alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com; >linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; david@redhat.com; >Vilas.Sridharan@amd.com; leo.duran@amd.com; Yazen.Ghannam@amd.com; >rientjes@google.com; jiaqiyan@google.com; tony.luck@intel.com; >Jon.Grimm@amd.com; dave.hansen@linux.intel.com; rafael@kernel.org; >lenb@kernel.org; naoya.horiguchi@nec.com; james.morse@arm.com; >jthoughton@google.com; somasundaram.a@hpe.com; >erdemaktas@google.com; pgonda@google.com; duenwen@google.com; >mike.malvestuto@intel.com; gthelen@google.com; >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; >kangkang.shen@futurewei.com; wanghuiqiang <wanghuiqiang@huawei.com>; >Linuxarm <linuxarm@huawei.com> >Subject: Re: [RFC PATCH v8 01/10] ras: scrub: Add scrub subsystem > >On Sat, Apr 20, 2024 at 12:47:10AM +0800, shiju.jose@huawei.com wrote: >> From: Shiju Jose <shiju.jose@huawei.com> >> >> Add scrub subsystem supports configuring the memory scrubbers in the >> system. The scrub subsystem provides the interface for registering the >> scrub devices. The scrub control attributes are provided to the user >> in /sys/class/ras/rasX/scrub >> >> Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> >> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> >> Signed-off-by: Shiju Jose <shiju.jose@huawei.com> >> --- >> .../ABI/testing/sysfs-class-scrub-configure | 47 +++ >> drivers/ras/Kconfig | 7 + >> drivers/ras/Makefile | 1 + >> drivers/ras/memory_scrub.c | 271 ++++++++++++++++++ >> include/linux/memory_scrub.h | 37 +++ >> 5 files changed, 363 insertions(+) >> create mode 100644 >> Documentation/ABI/testing/sysfs-class-scrub-configure >> create mode 100755 drivers/ras/memory_scrub.c create mode 100755 >> include/linux/memory_scrub.h > >ERROR: modpost: missing MODULE_LICENSE() in drivers/ras/memory_scrub.o >make[2]: *** [scripts/Makefile.modpost:145: Module.symvers] Error 1 >make[1]: *** [/mnt/kernel/kernel/2nd/linux/Makefile:1871: modpost] Error 2 >make: *** [Makefile:240: __sub-make] Error 2 > >Each patch of yours needs to build. Fixed. > >> diff --git a/Documentation/ABI/testing/sysfs-class-scrub-configure >> b/Documentation/ABI/testing/sysfs-class-scrub-configure >> new file mode 100644 >> index 000000000000..3ed77dbb00ad >> --- /dev/null >> +++ b/Documentation/ABI/testing/sysfs-class-scrub-configure >> @@ -0,0 +1,47 @@ >> +What: /sys/class/ras/ >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + The ras/ class subdirectory belongs to the >> + common ras features such as scrub subsystem. >> + >> +What: /sys/class/ras/rasX/scrub/ >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + The /sys/class/ras/ras{0,1,2,3,...}/scrub directories > >You have different scrubbers. > >I'd prefer if you put their names in here instead and do this structure: > >/sys/class/ras/scrub/cxl-patrol > /ars > /cxl-ecs > /acpi-ras2 > >and so on. > >Unless the idea is for those devices to have multiple RAS-specific functionality >than just scrubbing. Then you want to do > >/sys/class/ras/cxl/scrub > /other_function > >/sys/class/ras/ars/scrub > /... > >You get the idea. It is expected to have multiple RAS-specific functionalities other than scrubbing in long run. Most of the classes in the kernel found as /sys/class/<class-name>/<class-name>X/ If not, however /sys/class/ras/<module -name>X/<feature> is more suitable because there are multiple device instances such as cxl devices with scrub control feature. For example, /sys/class/ras/cxlX/scrub > >> + correspond to each scrub device registered with the >> + scrub subsystem. >> + >> +What: /sys/class/ras/rasX/scrub/name >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + (RO) name of the memory scrubber >> + >> +What: /sys/class/ras/rasX/scrub/enable_background >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + (RW) Enable/Disable background(patrol) scrubbing if supported. >> + >> +What: /sys/class/ras/rasX/scrub/rate_available > >That's dumping a range so I guess it should be called probably "possible_rates" >or so, so that it is clear what it means. > >If some scrubbers support only a discrete set of rate values, then >"possible_rates" fits too if you dump them as a list of values. Sure. Will check. > >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + (RO) Supported range for the scrub rate by the scrubber. >> + The scrub rate represents in hours. >> + >> +What: /sys/class/ras/rasX/scrub/rate >> +Date: March 2024 >> +KernelVersion: 6.9 >> +Contact: linux-kernel@vger.kernel.org >> +Description: >> + (RW) The scrub rate specified and it must be with in the >> + supported range by the scrubber. >> + The scrub rate represents in hours. >> diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index >> fc4f4bb94a4c..181701479564 100644 >> --- a/drivers/ras/Kconfig >> +++ b/drivers/ras/Kconfig >> @@ -46,4 +46,11 @@ config RAS_FMPM >> Memory will be retired during boot time and run time depending on >> platform-specific policies. >> >> +config SCRUB >> + tristate "Memory scrub driver" >> + help >> + This option selects the memory scrub subsystem, supports > >s/This option selects/Enable/ Sure. > >> + configuring the parameters of underlying scrubbers in the >> + system for the DRAM memories. >> + >> endif >> diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index >> 11f95d59d397..89bcf0d84355 100644 >> --- a/drivers/ras/Makefile >> +++ b/drivers/ras/Makefile >> @@ -2,6 +2,7 @@ >> obj-$(CONFIG_RAS) += ras.o >> obj-$(CONFIG_DEBUG_FS) += debugfs.o >> obj-$(CONFIG_RAS_CEC) += cec.o >> +obj-$(CONFIG_SCRUB) += memory_scrub.o >> >> obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o >> obj-y += amd/atl/ >> diff --git a/drivers/ras/memory_scrub.c b/drivers/ras/memory_scrub.c >> new file mode 100755 index 000000000000..7e995380ec3a >> --- /dev/null >> +++ b/drivers/ras/memory_scrub.c >> @@ -0,0 +1,271 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> +/* >> + * Memory scrub subsystem supports configuring the registered >> + * memory scrubbers. >> + * >> + * Copyright (c) 2024 HiSilicon Limited. >> + */ >> + >> +#define pr_fmt(fmt) "MEM SCRUB: " fmt >> + >> +#include <linux/acpi.h> >> +#include <linux/bitops.h> >> +#include <linux/delay.h> >> +#include <linux/kfifo.h> >> +#include <linux/memory_scrub.h> >> +#include <linux/platform_device.h> >> +#include <linux/spinlock.h> >> + >> +/* memory scrubber config definitions */ > >No need for that comment. Will remove. > >> +static ssize_t rate_available_show(struct device *dev, >> + struct device_attribute *attr, >> + char *buf) >> +{ >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + u64 min_sr, max_sr; >> + int ret; >> + >> + ret = scrub_dev->ops->rate_avail_range(dev, &min_sr, &max_sr); >> + if (ret) >> + return ret; >> + >> + return sysfs_emit(buf, "0x%llx-0x%llx\n", min_sr, max_sr); } > >This glue driver will need to store the min and max scrub rates on init and >rate_store() will have to verify the newly supplied rate is within that range >before writing it. > >Not the user, nor the underlying hw driver. Presently underlying hw driver does the check. I think this will become more complex if does in the common rate_store() if we have to check against either a list of possible rates or min and max rates. > >> + >> +DEVICE_ATTR_RW(enable_background); >> +DEVICE_ATTR_RO(name); >> +DEVICE_ATTR_RW(rate); >> +DEVICE_ATTR_RO(rate_available); > >static > >> + >> +static struct attribute *scrub_attrs[] = { >> + &dev_attr_enable_background.attr, >> + &dev_attr_name.attr, >> + &dev_attr_rate.attr, >> + &dev_attr_rate_available.attr, >> + NULL >> +}; >> + >> +static umode_t scrub_attr_visible(struct kobject *kobj, >> + struct attribute *a, int attr_id) { >> + struct device *dev = kobj_to_dev(kobj); >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + const struct scrub_ops *ops = scrub_dev->ops; >> + >> + if (a == &dev_attr_enable_background.attr) { >> + if (ops->set_enabled_bg && ops->get_enabled_bg) >> + return a->mode; >> + if (ops->get_enabled_bg) >> + return 0444; >> + return 0; >> + } >> + if (a == &dev_attr_name.attr) >> + return ops->get_name ? a->mode : 0; >> + if (a == &dev_attr_rate_available.attr) >> + return ops->rate_avail_range ? a->mode : 0; >> + if (a == &dev_attr_rate.attr) { /* Write only makes little sense */ >> + if (ops->rate_read && ops->rate_write) >> + return a->mode; >> + if (ops->rate_read) >> + return 0444; >> + return 0; >> + } > >All of that stuff's permissions should be root-only. Sure. > >> + >> + return 0; >> +} >> + >> +static const struct attribute_group scrub_attr_group = { >> + .name = "scrub", >> + .attrs = scrub_attrs, >> + .is_visible = scrub_attr_visible, >> +}; >> + >> +static const struct attribute_group *scrub_attr_groups[] = { >> + &scrub_attr_group, >> + NULL >> +}; >> + >> +static void scrub_dev_release(struct device *dev) { >> + struct scrub_device *scrub_dev = to_scrub_device(dev); >> + >> + ida_free(&scrub_ida, scrub_dev->id); >> + kfree(scrub_dev); >> +} >> + >> +static struct class scrub_class = { >> + .name = "ras", >> + .dev_groups = scrub_attr_groups, >> + .dev_release = scrub_dev_release, >> +}; >> + >> +static struct device * >> +scrub_device_register(struct device *parent, void *drvdata, >> + const struct scrub_ops *ops) >> +{ >> + struct scrub_device *scrub_dev; >> + struct device *hdev; >> + int err; >> + >> + scrub_dev = kzalloc(sizeof(*scrub_dev), GFP_KERNEL); >> + if (!scrub_dev) >> + return ERR_PTR(-ENOMEM); >> + hdev = &scrub_dev->dev; >> + >> + scrub_dev->id = ida_alloc(&scrub_ida, GFP_KERNEL); > >What's that silly thing for? This is the ras instance id (X) used for scrub control feature, /sys/class/ras/rasX/scrub/ > >> + if (scrub_dev->id < 0) { >> + kfree(scrub_dev); >> + return ERR_PTR(-ENOMEM); >> + } >> + >> + scrub_dev->ops = ops; >> + hdev->class = &scrub_class; >> + hdev->parent = parent; >> + dev_set_drvdata(hdev, drvdata); >> + dev_set_name(hdev, SCRUB_ID_FORMAT, scrub_dev->id); >> + err = device_register(hdev); >> + if (err) { >> + put_device(hdev); >> + return ERR_PTR(err); >> + } >> + >> + return hdev; >> +} >> + >> +static void devm_scrub_release(void *dev) { >> + device_unregister(dev); >> +} >> + >> +/** >> + * devm_scrub_device_register - register scrubber device >> + * @dev: the parent device >> + * @drvdata: driver data to attach to the scrub device >> + * @ops: pointer to scrub_ops structure (optional) >> + * >> + * Returns the pointer to the new device on success, ERR_PTR() otherwise. >> + * The new device would be automatically unregistered with the parent >device. >> + */ >> +struct device * >> +devm_scrub_device_register(struct device *dev, void *drvdata, >> + const struct scrub_ops *ops) >> +{ >> + struct device *hdev; >> + int ret; >> + >> + if (!dev) >> + return ERR_PTR(-EINVAL); >> + >> + hdev = scrub_device_register(dev, drvdata, ops); >> + if (IS_ERR(hdev)) >> + return hdev; >> + >> + ret = devm_add_action_or_reset(dev, devm_scrub_release, hdev); >> + if (ret) >> + return ERR_PTR(ret); >> + >> + return hdev; >> +} >> +EXPORT_SYMBOL_GPL(devm_scrub_device_register); >> + >> +static int __init memory_scrub_control_init(void) { >> + return class_register(&scrub_class); } >> +subsys_initcall(memory_scrub_control_init); > >You can't just blindly register this thing without checking whether there are even >any hw scrubber devices on the system. I think it happens only when a dependent module as autoloaded based on a scrub device existing with exception of memory scrub control built in and who would build this in? > >-- >Regards/Gruss, > Boris. > Thanks, Shiju
On Thu, Apr 25, 2024 at 06:11:13PM +0000, Shiju Jose wrote: > It is expected to have multiple RAS-specific functionalities other > than scrubbing in long run. Most of the classes in the kernel found > as /sys/class/<class-name>/<class-name>X/ > > If not, however /sys/class/ras/<module -name>X/<feature> is more > suitable because there are multiple device instances such as cxl > devices with scrub control feature. For example, > /sys/class/ras/cxlX/scrub Make it as user-friendly as possible. cxlX is not as user-friendly as /sys/class/ras/cxl/<mem_accelerator> /<fancy_bla_thing> and so on. Yes, you can introduce a special category .../ras/cxl/ if there are multiple cxl devices which have RAS functionality on them. > Presently underlying hw driver does the check. I think this will > become more complex if does in the common rate_store() if we have to > check against either a list of possible rates or min and max rates. Ok. > >> +DEVICE_ATTR_RW(enable_background); > >> +DEVICE_ATTR_RO(name); > >> +DEVICE_ATTR_RW(rate); > >> +DEVICE_ATTR_RO(rate_available); > > > >static Forgot one. > This is the ras instance id (X) used for scrub control feature, /sys/class/ras/rasX/scrub/ Yeah, as discussed above. > >> +static int __init memory_scrub_control_init(void) { > >> + return class_register(&scrub_class); } > >> +subsys_initcall(memory_scrub_control_init); > > > >You can't just blindly register this thing without checking whether there are even > >any hw scrubber devices on the system. > > I think it happens only when a dependent module as autoloaded based > on a scrub device existing with exception of memory scrub control > built in and who would build this in? You think or you know?
>-----Original Message----- >From: Borislav Petkov <bp@alien8.de> >Sent: 06 May 2024 11:30 >To: Shiju Jose <shiju.jose@huawei.com> >Cc: linux-cxl@vger.kernel.org; linux-acpi@vger.kernel.org; linux- >mm@kvack.org; dan.j.williams@intel.com; dave@stgolabs.net; Jonathan >Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com; >alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com; >linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; david@redhat.com; >Vilas.Sridharan@amd.com; leo.duran@amd.com; Yazen.Ghannam@amd.com; >rientjes@google.com; jiaqiyan@google.com; tony.luck@intel.com; >Jon.Grimm@amd.com; dave.hansen@linux.intel.com; rafael@kernel.org; >lenb@kernel.org; naoya.horiguchi@nec.com; james.morse@arm.com; >jthoughton@google.com; somasundaram.a@hpe.com; >erdemaktas@google.com; pgonda@google.com; duenwen@google.com; >mike.malvestuto@intel.com; gthelen@google.com; >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; >kangkang.shen@futurewei.com; wanghuiqiang <wanghuiqiang@huawei.com>; >Linuxarm <linuxarm@huawei.com> >Subject: Re: [RFC PATCH v8 01/10] ras: scrub: Add scrub subsystem > >On Thu, Apr 25, 2024 at 06:11:13PM +0000, Shiju Jose wrote: >> It is expected to have multiple RAS-specific functionalities other >> than scrubbing in long run. Most of the classes in the kernel found >> as /sys/class/<class-name>/<class-name>X/ >> >> If not, however /sys/class/ras/<module -name>X/<feature> is more >> suitable because there are multiple device instances such as cxl >> devices with scrub control feature. For example, >> /sys/class/ras/cxlX/scrub > >Make it as user-friendly as possible. cxlX is not as user-friendly as > >/sys/class/ras/cxl/<mem_accelerator> > /<fancy_bla_thing> > >and so on. > >Yes, you can introduce a special category .../ras/cxl/ if there are multiple cxl >devices which have RAS functionality on them. Sure. > >> Presently underlying hw driver does the check. I think this will >> become more complex if does in the common rate_store() if we have to >> check against either a list of possible rates or min and max rates. > >Ok. > >> >> +DEVICE_ATTR_RW(enable_background); >> >> +DEVICE_ATTR_RO(name); >> >> +DEVICE_ATTR_RW(rate); >> >> +DEVICE_ATTR_RO(rate_available); >> > >> >static > >Forgot one. Will correct. > >> This is the ras instance id (X) used for scrub control feature, >> /sys/class/ras/rasX/scrub/ > >Yeah, as discussed above. OK. > >> >> +static int __init memory_scrub_control_init(void) { >> >> + return class_register(&scrub_class); } >> >> +subsys_initcall(memory_scrub_control_init); >> > >> >You can't just blindly register this thing without checking whether >> >there are even any hw scrubber devices on the system. >> >> I think it happens only when a dependent module as autoloaded based on >> a scrub device existing with exception of memory scrub control built >> in and who would build this in? > >You think or you know? We know as I had tested. > >-- >Regards/Gruss, > Boris. > >https://people.kernel.org/tglx/notes-about-netiquette Thanks, Shiju
On Wed, May 08, 2024 at 04:59:18PM +0000, Shiju Jose wrote: > >> I think it happens only when a dependent module as autoloaded based on > >> a scrub device existing with exception of memory scrub control built > >> in and who would build this in? > > > >You think or you know? > We know as I had tested. Does this thing register successfully on a machine which doesn't have those devices?
>-----Original Message----- >From: Borislav Petkov <bp@alien8.de> >Sent: 08 May 2024 18:20 >To: Shiju Jose <shiju.jose@huawei.com> >Cc: linux-cxl@vger.kernel.org; linux-acpi@vger.kernel.org; linux- >mm@kvack.org; dan.j.williams@intel.com; dave@stgolabs.net; Jonathan >Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com; >alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com; >linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; david@redhat.com; >Vilas.Sridharan@amd.com; leo.duran@amd.com; Yazen.Ghannam@amd.com; >rientjes@google.com; jiaqiyan@google.com; tony.luck@intel.com; >Jon.Grimm@amd.com; dave.hansen@linux.intel.com; rafael@kernel.org; >lenb@kernel.org; naoya.horiguchi@nec.com; james.morse@arm.com; >jthoughton@google.com; somasundaram.a@hpe.com; >erdemaktas@google.com; pgonda@google.com; duenwen@google.com; >mike.malvestuto@intel.com; gthelen@google.com; >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; >kangkang.shen@futurewei.com; wanghuiqiang <wanghuiqiang@huawei.com>; >Linuxarm <linuxarm@huawei.com> >Subject: Re: [RFC PATCH v8 01/10] ras: scrub: Add scrub subsystem > >On Wed, May 08, 2024 at 04:59:18PM +0000, Shiju Jose wrote: >> >> I think it happens only when a dependent module as autoloaded based >> >> on a scrub device existing with exception of memory scrub control >> >> built in and who would build this in? >> > >> >You think or you know? >> We know as I had tested. > >Does this thing register successfully on a machine which doesn't have those >devices? > I mean scrub subsystem module is not loaded and initialzed until a dependent device module is loaded and a device does not get registered with the scrub subsystem on a machine which doesn't have the corresponding scrub features. >-- >Regards/Gruss, > Boris. > >https://people.kernel.org/tglx/notes-about-netiquette Thanks, Shiju
On Wed, May 08, 2024 at 05:44:03PM +0000, Shiju Jose wrote: > I mean scrub subsystem module is not loaded and initialzed until > a dependent device module is loaded and a device does not get > registered with the scrub subsystem on a machine which doesn't have > the corresponding scrub features. Stop this rambling blabla please. This should *not* happen: # insmod ./memory_scrub.ko # echo $? 0 # lsmod Module Size Used by memory_scrub 12288 0 This is on a silly guest which has none of those dependent devices crap. Your scrub module should load only on a machine which has the hardware - not just for fun and on anything.
On Wed, 8 May 2024 21:25:46 +0200 Borislav Petkov <bp@alien8.de> wrote: > On Wed, May 08, 2024 at 05:44:03PM +0000, Shiju Jose wrote: > > I mean scrub subsystem module is not loaded and initialzed until > > a dependent device module is loaded and a device does not get > > registered with the scrub subsystem on a machine which doesn't have > > the corresponding scrub features. > > Stop this rambling blabla please. This should *not* happen: > > # insmod ./memory_scrub.ko > # echo $? > 0 > # lsmod > Module Size Used by > memory_scrub 12288 0 > > This is on a silly guest which has none of those dependent devices crap. > > Your scrub module should load only on a machine which has the hardware > - not just for fun and on anything. Fundamental question seems to be: Why should it not load? Shiju and I think it should, you think it shouldn't. Note this is only if someone deliberately ignores all the infrastructure intended to make sure only relevant modules probe and modprobe / insmod by hand. +CC some driver core folk and a few other subsystem maintainers who have subsystems doing the same as this one. Summary I think is: Borislav is asking for this new scrub subsystem core module to not successfully probe and call class_register() if it is manually inserted and there is no hardware on the particular system. It's a standard class type situation with core driver providing consistent ABI and /sys/class/ras/ with drivers hanging off various buses (currently ACPI and CXL) registering with that class. Many subsystem core drivers will probe and create subsystem specific sysfs directories on on systems that don't have any hardware needing drivers from that subsystem (if someone manually inserts them rather than relying on automatic module dependency handling.) I don't see why this class driver should be different and have to jump through hoops to satisfy this requirement. A quick look for callers of class_register() in their init functions found plenty of precedence. Many of the cases that don't do this are single use - i.e. class that only ever has one driver. There are even more if we take sysfs buses into account. (edac and IIO for example) A few examples of same handling of class registration. - input - that registers a lot more on class init, but sysfs class registration is in there. - hwmon - other than some quirk setup same as the scrub driver. Other than embedded systems with a custom build and kernel developers, who actually probes modules manually? Mostly people rely on modalias of the client drivers and them pulling in their dependencies. Modules are pretty pointless if you probe all the ones you've built whether or not the hardware is present. It would of course be easy to do the class_register() on first driver use but I'm not seeing a lot of precedence + the scrub class module would still insmod successfully. I think preventing load would be messy and complex at best. Jonathan
On May 9, 2024 11:19:39 AM GMT+02:00, Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote: >Many subsystem core drivers will probe and create subsystem specific >sysfs directories on on systems that don't have any hardware needing >drivers from that subsystem (if someone manually inserts them rather >than relying on automatic module dependency handling.) >I don't see why this class driver should be different and have to jump >through hoops to satisfy this requirement. You mean it should load because "Look ma, the others do it this way". Does it make any sense? Of course not. Are you arguing for the nonsensical "it should load" case because it is simply easier this way? How hard is that "jump through hoops" thing anyway? You mean it should load so that when booting an allmodconfig kernel there are not enough modules which are loading so lemme load one more. And then I need to go and rmmod them all before I need to do localmodconfig and build a tailored kernel for the machine. Or is there some other reason to load silly modules, use up resources for no good reason whatsoever and bloat the machine? You mean, f*ck it, right? Who cares... Geez.
On Thu, May 09, 2024 at 05:52:18PM +0200, Borislav Petkov wrote: > Are you arguing for the nonsensical "it should load" case because it > is simply easier this way? How hard is that "jump through hoops" thing > anyway? Let's see: the following patches add something called GET_SUPPORTED_FEATURES which is used to detect whether the system has patrol scrub functionality etc. Then there's ras2_acpi_init() which checks for a RAS2 ACPI table. Are you saying that checking for those two things in the init function is jumping through hoops?
Borislav Petkov wrote: > On Thu, May 09, 2024 at 05:52:18PM +0200, Borislav Petkov wrote: > > Are you arguing for the nonsensical "it should load" case because it > > is simply easier this way? How hard is that "jump through hoops" thing > > anyway? > > Let's see: the following patches add something called > GET_SUPPORTED_FEATURES which is used to detect whether the system has > patrol scrub functionality etc. > > Then there's ras2_acpi_init() which checks for a RAS2 ACPI table. > > Are you saying that checking for those two things in the init function > is jumping through hoops? Wait, this discussion has gone off the rails. Recall that there are 461 usages of module_pci_driver() in the kernel. Every one of those arranges for just registering a PCI driver when the module is loaded regardless of whether any devices that driver cares about are present. Consider the case of the PCI subsystem that allows dynamically updating the device ids that a driver attaches. I.e. echo $id > /sys/bus/pci/drivers/$driver/new_id ...the fundamentally does not work if the module unloads itself and the driver(s) it registers when the PCI bus core finds no devices to attach at runtime. The mitigation for keeping unneeded modules unloaded is that udev does not autoload modules unless a PCI bus udev event matches a module's published PCI device table. Don't want to waste module memory? Don't load the module.
shiju.jose@ wrote: > From: Shiju Jose <shiju.jose@huawei.com> > > Add scrub subsystem supports configuring the memory scrubbers > in the system. The scrub subsystem provides the interface for > registering the scrub devices. The scrub control attributes > are provided to the user in /sys/class/ras/rasX/scrub > > Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > Signed-off-by: Shiju Jose <shiju.jose@huawei.com> > --- > .../ABI/testing/sysfs-class-scrub-configure | 47 +++ > drivers/ras/Kconfig | 7 + > drivers/ras/Makefile | 1 + > drivers/ras/memory_scrub.c | 271 ++++++++++++++++++ > include/linux/memory_scrub.h | 37 +++ > 5 files changed, 363 insertions(+) > create mode 100644 Documentation/ABI/testing/sysfs-class-scrub-configure > create mode 100755 drivers/ras/memory_scrub.c > create mode 100755 include/linux/memory_scrub.h > > diff --git a/Documentation/ABI/testing/sysfs-class-scrub-configure b/Documentation/ABI/testing/sysfs-class-scrub-configure > new file mode 100644 > index 000000000000..3ed77dbb00ad > --- /dev/null > +++ b/Documentation/ABI/testing/sysfs-class-scrub-configure > @@ -0,0 +1,47 @@ > +What: /sys/class/ras/ > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + The ras/ class subdirectory belongs to the > + common ras features such as scrub subsystem. Why create "ras" class versus just a "srcub" class? I am otherwise not aware of a precedent for class device hierarchy. For example, on my system there is: /sys/class/ ├── scsi_device ├── scsi_disk ├── scsi_generic └── scsi_host ...not: /sys/class/scsi/ ├── device ├── disk ├── generic └── host > + > +What: /sys/class/ras/rasX/scrub/ > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + The /sys/class/ras/ras{0,1,2,3,...}/scrub directories > + correspond to each scrub device registered with the > + scrub subsystem. I notice there are some visibility rules in the code, but those expectations are not documented here. This documentation would also help developers writing new users of scrub_device_register(). > + > +What: /sys/class/ras/rasX/scrub/name > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RO) name of the memory scrubber > + > +What: /sys/class/ras/rasX/scrub/enable_background > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RW) Enable/Disable background(patrol) scrubbing if supported. > + > +What: /sys/class/ras/rasX/scrub/rate_available > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RO) Supported range for the scrub rate by the scrubber. > + The scrub rate represents in hours. > + > +What: /sys/class/ras/rasX/scrub/rate > +Date: March 2024 > +KernelVersion: 6.9 > +Contact: linux-kernel@vger.kernel.org > +Description: > + (RW) The scrub rate specified and it must be with in the > + supported range by the scrubber. > + The scrub rate represents in hours. > diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig > index fc4f4bb94a4c..181701479564 100644 > --- a/drivers/ras/Kconfig > +++ b/drivers/ras/Kconfig > @@ -46,4 +46,11 @@ config RAS_FMPM > Memory will be retired during boot time and run time depending on > platform-specific policies. > > +config SCRUB > + tristate "Memory scrub driver" > + help > + This option selects the memory scrub subsystem, supports > + configuring the parameters of underlying scrubbers in the > + system for the DRAM memories. > + > endif > diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile > index 11f95d59d397..89bcf0d84355 100644 > --- a/drivers/ras/Makefile > +++ b/drivers/ras/Makefile > @@ -2,6 +2,7 @@ > obj-$(CONFIG_RAS) += ras.o > obj-$(CONFIG_DEBUG_FS) += debugfs.o > obj-$(CONFIG_RAS_CEC) += cec.o > +obj-$(CONFIG_SCRUB) += memory_scrub.o > > obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o > obj-y += amd/atl/ > diff --git a/drivers/ras/memory_scrub.c b/drivers/ras/memory_scrub.c > new file mode 100755 > index 000000000000..7e995380ec3a > --- /dev/null > +++ b/drivers/ras/memory_scrub.c > @@ -0,0 +1,271 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Memory scrub subsystem supports configuring the registered > + * memory scrubbers. > + * > + * Copyright (c) 2024 HiSilicon Limited. > + */ > + > +#define pr_fmt(fmt) "MEM SCRUB: " fmt > + > +#include <linux/acpi.h> > +#include <linux/bitops.h> > +#include <linux/delay.h> > +#include <linux/kfifo.h> > +#include <linux/memory_scrub.h> > +#include <linux/platform_device.h> > +#include <linux/spinlock.h> > + > +/* memory scrubber config definitions */ > +#define SCRUB_ID_PREFIX "ras" > +#define SCRUB_ID_FORMAT SCRUB_ID_PREFIX "%d" > + > +static DEFINE_IDA(scrub_ida); > + > +struct scrub_device { > + int id; > + struct device dev; > + const struct scrub_ops *ops; > +}; > + > +#define to_scrub_device(d) container_of(d, struct scrub_device, dev) > +static ssize_t enable_background_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t len) > +{ > + struct scrub_device *scrub_dev = to_scrub_device(dev); > + bool enable; > + int ret; > + > + ret = kstrtobool(buf, &enable); > + if (ret < 0) > + return ret; > + > + ret = scrub_dev->ops->set_enabled_bg(dev, enable); > + if (ret) > + return ret; It strikes me as somewhat pointless to have such a thin sysfs implementation whose only job is to call down into a callback to do the work. Unless there are other consumers of 'struct scrub_ops' outside of these sysfs files why not just have the low-level drivers register their corresponding attributes themselves? Unless the functionality is truly generic just let the low-level driver be responsible for conforming to the sysfs ABI expectations, and, for example, each register their own "enable_background" attribute if they support that semantic. So scrub_device_register() would grow a 'const struct attribute_group **groups' argument, or something along those lines.
On Thu, May 09, 2024 at 02:21:28PM -0700, Dan Williams wrote: > Recall that there are 461 usages of module_pci_driver() in the kernel. > Every one of those arranges for just registering a PCI driver when the > module is loaded regardless of whether any devices that driver cares > about are present. Sorry, I read your text a bunch of times but I still have no clue what you're trying to tell me. All *I* am saying is since this is a new subsystem and the methods for detecting the scrub functionality are two - either an ACPI table or a GET_SUPPORTED_FEATURES command, then the init function of the subsystem: +static int __init memory_scrub_control_init(void) +{ + return class_register(&scrub_class); +} +subsys_initcall(memory_scrub_control_init); can check for those two things before initializing. If there is no scrubbing functionality, then it can return an error and not load. The same as when we don't load x86 drivers on the wrong vendor and so on. If the check is easy, why not do it? Make more sense?
Borislav Petkov wrote: > On Thu, May 09, 2024 at 02:21:28PM -0700, Dan Williams wrote: > > Recall that there are 461 usages of module_pci_driver() in the kernel. > > Every one of those arranges for just registering a PCI driver when the > > module is loaded regardless of whether any devices that driver cares > > about are present. > > Sorry, I read your text a bunch of times but I still have no clue what > you're trying to tell me. Because taking this proposal to its logical end of "if a simple check is possible, why not do it in module_init()" has wide implications like the module_pci_driver() example. > All *I* am saying is since this is a new subsystem and the methods for > detecting the scrub functionality are two - either an ACPI table or > a GET_SUPPORTED_FEATURES command, then the init function of the > subsystem: No, at a minimum that's a layering violation. This is a generic library facility that should not care if it is being called for a CXL device or an ACPI device. It also causes functional issues, see below: > +static int __init memory_scrub_control_init(void) > +{ > + return class_register(&scrub_class); > +} > +subsys_initcall(memory_scrub_control_init); > > can check for those two things before initializing. > > If there is no scrubbing functionality, then it can return an error and > not load. > > The same as when we don't load x86 drivers on the wrong vendor and so > on. I think it works for x86 drivers because the functionality in those modules is wholly contained within that one module. This scrub module is a service library for other modules. > If the check is easy, why not do it? It is functionally the wrong place to do the check. When module_init() fails it causes not only the current module to be unloaded but any dependent modules will also fail to load. Let's take an example of the CXL driver wanting to register with this scrub interface to support the capability that *might* be available on some CXL devices. The cxl_pci.ko module, that houses cxl_pci_driver, grows a call to scrub_device_register(). That scrub_device_register() call is statically present in cxl_pci.ko so that when cxl_pci.ko loads symbol resolution requires scrub.ko to load. Neither of those modules (cxl_pci.ko or scrub.ko) load automatically. Either udev loads cxl_pci.ko because it sees a device that matches cxl_mem_pci_tbl, or the user manually insmods those modules because they think they know better. No memory wasted unless the user explicitly asks for memory to be wasted. If no CXL devices in the system have scrub capabilities, great, then scrub_device_register() will never be called. Now, if memory_scrub_control_init() did its own awkward and redundant CXL scan, and fails with "no CXL scrub capable devices found" it would not only block scrub.ko from loading, but also cxl_pci.ko since cxl_pci.ko needs to resolve that symbol to load. All of that said, you are right that there is still a scenario where memory is wasted. I.e. the case where a subsystem like CXL or ACPI wants the runtime *option* of calling scrub_device_register(), but never does. That will inflict the cost of registering a vestigial scrub_class. That can be mitigated with another layer of module indirection where cxl_pci_driver registers a cxl_scrub_device and then a cxl_scrub_driver in its own module calls scrub_device_register() with the scrub core. I would entertain that extra indirection long before I would entertain memory_scrub_control_init() growing scrub device enumeration that belongs to the *caller* of scrub_device_register(). > Make more sense? It is a reasonable question, but all module libraries incur init costs just by being linked by another module. You can walk /sys/class to see how many other subsystems are registering class devices but never using them. I would not say "no" to a generic facility that patches out module dependencies until the first call, just not sure the return on investment would be worth it. Lastly I think drivers based on ACPI tables are awkward. They really need to have an ACPI device to attach so that typical automatic Linux module loading machinery can be used. The fact this function is a subsys_initcall() is a red-flag since nothing should be depending on the load order of a little driver to control scrub parameters.
On Thu, 9 May 2024 14:47:56 -0700 Dan Williams <dan.j.williams@intel.com> wrote: > shiju.jose@ wrote: > > From: Shiju Jose <shiju.jose@huawei.com> > > > > Add scrub subsystem supports configuring the memory scrubbers > > in the system. The scrub subsystem provides the interface for > > registering the scrub devices. The scrub control attributes > > are provided to the user in /sys/class/ras/rasX/scrub > > > > Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > > Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> > > Signed-off-by: Shiju Jose <shiju.jose@huawei.com> > > --- > > .../ABI/testing/sysfs-class-scrub-configure | 47 +++ > > drivers/ras/Kconfig | 7 + > > drivers/ras/Makefile | 1 + > > drivers/ras/memory_scrub.c | 271 ++++++++++++++++++ > > include/linux/memory_scrub.h | 37 +++ > > 5 files changed, 363 insertions(+) > > create mode 100644 Documentation/ABI/testing/sysfs-class-scrub-configure > > create mode 100755 drivers/ras/memory_scrub.c > > create mode 100755 include/linux/memory_scrub.h > > > > diff --git a/Documentation/ABI/testing/sysfs-class-scrub-configure b/Documentation/ABI/testing/sysfs-class-scrub-configure > > new file mode 100644 > > index 000000000000..3ed77dbb00ad > > --- /dev/null > > +++ b/Documentation/ABI/testing/sysfs-class-scrub-configure > > @@ -0,0 +1,47 @@ > > +What: /sys/class/ras/ > > +Date: March 2024 > > +KernelVersion: 6.9 > > +Contact: linux-kernel@vger.kernel.org > > +Description: > > + The ras/ class subdirectory belongs to the > > + common ras features such as scrub subsystem. Hi Dan, > > Why create "ras" class versus just a "srcub" class? I am otherwise not > aware of a precedent for class device hierarchy. For example, on my > system there is: I think that's miss described - aim is on subsystem, the first feature supported is scrub. Intent here is to group RAS features of a given device / interface etc into one place. This was a request in an review of an earlier version on basis these interfaces tend to get grouped together in a device. So options are /sys/class/ras/cxl_mem0/scrub/rate etc. /sys/class/ras/cxl_mem0/ecs/rate etc (maybe separate for ECS because it annoyingly looks nothing like scrub despite name and there are multiple impelmentations) vs /sys/class/ras/cxl_mem0_scrub /sys/class/ras/cxl_mem0_ecs etc Note that generic naming not including what the source was got negative reviews in favor of making that the device instance name here. So that rulled out simply /sys/class/ras/scrubX/ /sys/class/ras/ecsX/ I don't mind which way we go; both are extensible. > > /sys/class/ > ├── scsi_device > ├── scsi_disk > ├── scsi_generic > └── scsi_host > > ...not: > > /sys/class/scsi/ > ├── device > ├── disk > ├── generic > └── host That's a docs problem - this was never the intent. > > > > + > > +What: /sys/class/ras/rasX/scrub/ > > +Date: March 2024 > > +KernelVersion: 6.9 > > +Contact: linux-kernel@vger.kernel.org > > +Description: > > + The /sys/class/ras/ras{0,1,2,3,...}/scrub directories > > + correspond to each scrub device registered with the > > + scrub subsystem. > > I notice there are some visibility rules in the code, but those > expectations are not documented here. > > This documentation would also help developers writing new users of > scrub_device_register(). Agreed. One to improve. > > > + > > +What: /sys/class/ras/rasX/scrub/name > > +Date: March 2024 > > +KernelVersion: 6.9 > > +Contact: linux-kernel@vger.kernel.org > > +Description: > > + (RO) name of the memory scrubber > > + > > +What: /sys/class/ras/rasX/scrub/enable_background > > +Date: March 2024 > > +KernelVersion: 6.9 > > +Contact: linux-kernel@vger.kernel.org > > +Description: > > + (RW) Enable/Disable background(patrol) scrubbing if supported. > > + > > +What: /sys/class/ras/rasX/scrub/rate_available > > +Date: March 2024 > > +KernelVersion: 6.9 > > +Contact: linux-kernel@vger.kernel.org > > +Description: > > + (RO) Supported range for the scrub rate by the scrubber. > > + The scrub rate represents in hours. > > + > > +What: /sys/class/ras/rasX/scrub/rate > > +Date: March 2024 > > +KernelVersion: 6.9 > > +Contact: linux-kernel@vger.kernel.org > > +Description: > > + (RW) The scrub rate specified and it must be with in the > > + supported range by the scrubber. > > + The scrub rate represents in hours. > > diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig > > index fc4f4bb94a4c..181701479564 100644 > > --- a/drivers/ras/Kconfig > > +++ b/drivers/ras/Kconfig > > @@ -46,4 +46,11 @@ config RAS_FMPM > > Memory will be retired during boot time and run time depending on > > platform-specific policies. > > > > +config SCRUB > > + tristate "Memory scrub driver" > > + help > > + This option selects the memory scrub subsystem, supports > > + configuring the parameters of underlying scrubbers in the > > + system for the DRAM memories. > > + > > endif > > diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile > > index 11f95d59d397..89bcf0d84355 100644 > > --- a/drivers/ras/Makefile > > +++ b/drivers/ras/Makefile > > @@ -2,6 +2,7 @@ > > obj-$(CONFIG_RAS) += ras.o > > obj-$(CONFIG_DEBUG_FS) += debugfs.o > > obj-$(CONFIG_RAS_CEC) += cec.o > > +obj-$(CONFIG_SCRUB) += memory_scrub.o > > > > obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o > > obj-y += amd/atl/ > > diff --git a/drivers/ras/memory_scrub.c b/drivers/ras/memory_scrub.c > > new file mode 100755 > > index 000000000000..7e995380ec3a > > --- /dev/null > > +++ b/drivers/ras/memory_scrub.c > > @@ -0,0 +1,271 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* > > + * Memory scrub subsystem supports configuring the registered > > + * memory scrubbers. > > + * > > + * Copyright (c) 2024 HiSilicon Limited. > > + */ > > + > > +#define pr_fmt(fmt) "MEM SCRUB: " fmt > > + > > +#include <linux/acpi.h> > > +#include <linux/bitops.h> > > +#include <linux/delay.h> > > +#include <linux/kfifo.h> > > +#include <linux/memory_scrub.h> > > +#include <linux/platform_device.h> > > +#include <linux/spinlock.h> > > + > > +/* memory scrubber config definitions */ > > +#define SCRUB_ID_PREFIX "ras" > > +#define SCRUB_ID_FORMAT SCRUB_ID_PREFIX "%d" > > + > > +static DEFINE_IDA(scrub_ida); > > + > > +struct scrub_device { > > + int id; > > + struct device dev; > > + const struct scrub_ops *ops; > > +}; > > + > > +#define to_scrub_device(d) container_of(d, struct scrub_device, dev) > > +static ssize_t enable_background_store(struct device *dev, > > + struct device_attribute *attr, > > + const char *buf, size_t len) > > +{ > > + struct scrub_device *scrub_dev = to_scrub_device(dev); > > + bool enable; > > + int ret; > > + > > + ret = kstrtobool(buf, &enable); > > + if (ret < 0) > > + return ret; > > + > > + ret = scrub_dev->ops->set_enabled_bg(dev, enable); > > + if (ret) > > + return ret; > > It strikes me as somewhat pointless to have such a thin sysfs > implementation whose only job is to call down into a callback to do the > work. Unless there are other consumers of 'struct scrub_ops' outside of > these sysfs files why not just have the low-level drivers register their > corresponding attributes themselves? > > Unless the functionality is truly generic just let the low-level driver > be responsible for conforming to the sysfs ABI expectations, and, for > example, each register their own "enable_background" attribute if they > support that semantic. This was me pushing for this based on that approach having been a pain in subystems I've been involved with in the past. so I'll answer. Maybe if we think the number of scrub drivers remains very low we can rely on ABI review. However, it's painful. Everyone wants to add their own custom ABI, so every review consists of 'no that is isn't consistent' reviews. The callback schemes reduce that considerably. As someone with their name next to one of the largest sysfs ABIs in the kernel, maybe I'm projecting my pain points on this one. Note that this approach has failed for multiple similar simple subsystems in the past and they have migrated to a (mostly) tighter description for ABI simply because those constraints are useful. A fairly recent one maybe 8 years ago? Was hwmon. There are other advantages that may not yet apply here (in kernel interfaces are much easier, even if they are only occasionally used for a given subsystem), but my motivation in pushing Shiju this way was to lock down the userspace interface. > > So scrub_device_register() would grow a 'const struct attribute_group > **groups' argument, or something along those lines. Sure. Shiju had that in an earlier version. Personally I think it's an approach that may bite in the long run, but meh, maybe this will only ever have half a dozen drivers so it might remain manageable. If not, I love say 'I told you so' :) Jonathan
On Thu, May 09, 2024 at 03:59:29PM -0700, Dan Williams wrote: > No, at a minimum that's a layering violation. This is a generic library > facility that should not care if it is being called for a CXL device or > an ACPI device. Really? Because this looks like creating a subsystem for those two newfangled scrubbing functionalities which will be present in CXL devices and by that ACPI RAS2 thing. Because we have a *lot* of hw scrubbing functionality already. Just do: git grep "scrub" Some of it controls hw scrubbing. If this is a generic facility, does this mean that all those older scrubbers should be converted to it? Or is this thing going to support only new stuff? I.e., we want to disable our scrubbers when doing performance-sensitive workloads and reenable them after that but we don't care about old systems? And all that other bla about controlling scrubbers from userspace. So which is it? > I think it works for x86 drivers because the functionality in those > modules is wholly contained within that one module. This scrub module is > a service library for other modules. Well, you have that thing in EDAC. edac_core.ko is that service module and the chipset-specific drivers - at least on x86 - use a match_id to load only on the systems they should load on. If I had a BIOS table which had "EDAC" in it, I won't load edac_core.ko either but there isn't one. > It is functionally the wrong place to do the check. When module_init() > fails it causes not only the current module to be unloaded but any > dependent modules will also fail to load. See above. I'm under the assumption that this is using two methods to detect scrubbing functionality. > Let's take an example of the CXL driver wanting to register with this > scrub interface to support the capability that *might* be available on > some CXL devices. The cxl_pci.ko module, that houses cxl_pci_driver, > grows a call to scrub_device_register(). That scrub_device_register() > call is statically present in cxl_pci.ko so that when cxl_pci.ko loads > symbol resolution requires scrub.ko to load. > > Neither of those modules (cxl_pci.ko or scrub.ko) load automatically. > Either udev loads cxl_pci.ko because it sees a device that matches > cxl_mem_pci_tbl, or the user manually insmods those modules because they > think they know better. No memory wasted unless the user explicitly asks > for memory to be wasted. The scrub.ko goes and asks the system: "Do you have a CXL device with scrubbing functionality?" "Yes: load." "No: ok, won't." > If no CXL devices in the system have scrub capabilities, great, then > scrub_device_register() will never be called. > > Now, if memory_scrub_control_init() did its own awkward and redundant > CXL scan, and fails with "no CXL scrub capable devices found" it would > not only block scrub.ko from loading, but also cxl_pci.ko since > cxl_pci.ko needs to resolve that symbol to load. Why would it fail the scan? Isn't this fancy GET_SUPPORTED_FEATURES command giving you all info you need? > I would not say "no" to a generic facility that patches out module > dependencies until the first call, just not sure the return on > investment would be worth it. From the discussion so far yeah, I think you're right, it is not worth the effort. > Lastly I think drivers based on ACPI tables are awkward. They really > need to have an ACPI device to attach so that typical automatic Linux > module loading machinery can be used. The fact this function is a > subsys_initcall() is a red-flag since nothing should be depending on the > load order of a little driver to control scrub parameters. Yeah, it is becoming a mess before it has even started. So I don't mind if such drivers get loaded as long as doing this is the best we can do given the situation. What gets me up the palms, as they say in .de, is "just because" and "look, the others do it too." Thx.
> How hard is that "jump through hoops" thing anyway? I'd conservatively estimate 500 lines of duplicated code from the CXL subsystem just to handle the setup and discovery of the mailbox, plus all the checks needed to establish the device is in a state to reply. Also locking or module load ordering constraints because we need to ensure mutual exclusion on that mailbox between this module and the CXL core. So it would approximately triple the size of this driver to check for CXL scrub support. Not to mention hotplug - which could possibly be solved with appropriate udev rules to try loading this again whenever a CXL memory device gets plugged in. Alternative would be to make this ras class driver dependent on the CXL driver stack running first. Thus if you wanted RAS2 ACPI table support, you'd need to load a whole bunch of CXL stuff. Add another similar driver in future and we get another few 100 lines of code or another dependency. To me those numbers make it unsustainable. > > You mean it should load so that when booting an allmodconfig kernel there are not enough modules which are loading so lemme load one more. And then I need to go and rmmod them all before I need to do localmodconfig and build a tailored kernel for the machine. > > Or is there some other reason to load silly modules, use up resources for no good reason whatsoever and bloat the machine? As Dan, Shiju and I observed (and Shiju tested to be sure we weren't getting it wrong), normal setups including your allmodconfig build would not even load the driver. What are we missing? Jonathan
Borislav Petkov wrote: > On Thu, May 09, 2024 at 03:59:29PM -0700, Dan Williams wrote: > > No, at a minimum that's a layering violation. This is a generic library > > facility that should not care if it is being called for a CXL device or > > an ACPI device. > > Really? Yes. > Because this looks like creating a subsystem for those two newfangled > scrubbing functionalities which will be present in CXL devices and > by that ACPI RAS2 thing. > > Because we have a *lot* of hw scrubbing functionality already. Just do: > > git grep "scrub" > > Some of it controls hw scrubbing. If this is a generic facility, does > this mean that all those older scrubbers should be converted to it? > > Or is this thing going to support only new stuff? I.e., we want to > disable our scrubbers when doing performance-sensitive workloads and > reenable them after that but we don't care about old systems? > > And all that other bla about controlling scrubbers from userspace. > > So which is it? In fact this question matches my reaction to the last posting [1], and led to a much improved cover letter and the "Comparison of scrubbing features". To your point there are scrub capabilities already in the kernel and we would need to make a decision about what to do about them. I called out NVDIMM ARS as one example and am open to exploring converting that to the common scrub ABI, but not block this proposal in the meantime. For me the proposal can be boiled down to, "here we (kernel community) are again with 2 new scrub interfaces to add to the kernel. Lets step back, define a common ABI for ACPI RAS 2 and CXL to stop the proliferation of scrub ABIs, and then make a decision about when/whether to integrate legacy scrub facilities into this new interface". [1]: http://lore.kernel.org/r/65d6936952764_1138c7294e@dwillia2-xfh.jf.intel.com.notmuch > > I think it works for x86 drivers because the functionality in those > > modules is wholly contained within that one module. This scrub module is > > a service library for other modules. > > Well, you have that thing in EDAC. edac_core.ko is that service module > and the chipset-specific drivers - at least on x86 - use a match_id to > load only on the systems they should load on. Which is exactly the same mechanism being defined here. scrub_core.ko is a service module that would only be requested by an ACPI module or a CXL module after one of those loads based on their match_id. > If I had a BIOS table which had "EDAC" in it, I won't load edac_core.ko > either but there isn't one. > > > It is functionally the wrong place to do the check. When module_init() > > fails it causes not only the current module to be unloaded but any > > dependent modules will also fail to load. > > See above. I'm under the assumption that this is using two methods to > detect scrubbing functionality. The scrub_core, like edac_core, has no method to detect scrubbing facility, it is simply a passive library waiting for the first scrub_device_register() call. > > Let's take an example of the CXL driver wanting to register with this > > scrub interface to support the capability that *might* be available on > > some CXL devices. The cxl_pci.ko module, that houses cxl_pci_driver, > > grows a call to scrub_device_register(). That scrub_device_register() > > call is statically present in cxl_pci.ko so that when cxl_pci.ko loads > > symbol resolution requires scrub.ko to load. > > > > Neither of those modules (cxl_pci.ko or scrub.ko) load automatically. > > Either udev loads cxl_pci.ko because it sees a device that matches > > cxl_mem_pci_tbl, or the user manually insmods those modules because they > > think they know better. No memory wasted unless the user explicitly asks > > for memory to be wasted. > > The scrub.ko goes and asks the system: "Do you have a CXL device with > scrubbing functionality?" "Yes: load." "No: ok, won't." Yeah, that's backwards. CXL enumeration belongs in the CXL driver and the CXL driver is fully responsible for deciding when to incur the costs of loading scrub_core. > > If no CXL devices in the system have scrub capabilities, great, then > > scrub_device_register() will never be called. > > > > Now, if memory_scrub_control_init() did its own awkward and redundant > > CXL scan, and fails with "no CXL scrub capable devices found" it would > > not only block scrub.ko from loading, but also cxl_pci.ko since > > cxl_pci.ko needs to resolve that symbol to load. > > Why would it fail the scan? cxl_pci.ko loads based on match_id and cxl_pci_probe() enumerates device capabilities. My interpretation of your feedback is that memory_scrub_control_init() should duplicate that cxl_pci_probe() enumeration? Assume that it does and memory_scrub_control_init() finds no scrub facilities in any CXL devices and fails memory_scrub_control_init(). Any module that links to scrub_device_register() will also fail to load because module symbol resolution depends on all modules completing init. > Isn't this fancy GET_SUPPORTED_FEATURES command giving you all info you > need? Sure, but that's a driver-probe-time facility, not a module_init-time facility. > > Lastly I think drivers based on ACPI tables are awkward. They really > > need to have an ACPI device to attach so that typical automatic Linux > > module loading machinery can be used. The fact this function is a > > subsys_initcall() is a red-flag since nothing should be depending on the > > load order of a little driver to control scrub parameters. > > Yeah, it is becoming a mess before it has even started. I assume you do not consider edac_core a mess? Now, the question of how many legacy scrub interfaces should be considered in this design out of the gate is a worthwhile discussion. I am encouraged that this ABI is at least trying to handle more than 1 backend, which makes me feel better that adding a 3rd and 4th might not be prohibitive. > So I don't mind if such drivers get loaded as long as doing this is the > best we can do given the situation. What gets me up the palms, as they > say in .de, is "just because" and "look, the others do it too." Which matches what I reacted to on the last posting: "Maybe it is self evident to others, but for me there is little in these changelogs besides 'mechanism exists, enable it'" ...and to me that feedback was taken to heart with much improved changelogs in this new posting. This init time feature probing discussion feels like it was born from a micommunication / misunderstanding.
On Fri, May 10, 2024 at 10:13:41AM -0700, Dan Williams wrote: > In fact this question matches my reaction to the last posting [1], and > led to a much improved cover letter and the "Comparison of scrubbing > features". To your point there are scrub capabilities already in the > kernel and we would need to make a decision about what to do about them. The answer to that question is whether this new userspace usage is going to want to control those too. So "Use case of scrub control feature" from the cover letter is giving two short sentences about what one would do but I'm still meh. A whole subsystem needing a bunch of effort would need a lot more justification. So can anyone please elaborate more on the use cases and why this new thing is needed? > I called out NVDIMM ARS as one example and am open to exploring > converting that to the common scrub ABI, but not block this proposal > in the meantime. > > For me the proposal can be boiled down to, "here we (kernel community) > are again with 2 new scrub interfaces to add to the kernel. Lets step > back, define a common ABI for ACPI RAS 2 and CXL to stop the > proliferation of scrub ABIs, and then make a decision about when/whether > to integrate legacy scrub facilities into this new interface". Fully agreed as long as there's valid users for it and we don't end up designing and supporting an interface which people are not sure if anyone uses. ras_userspace_consumers() from the other thread case-in-point. > [1]: http://lore.kernel.org/r/65d6936952764_1138c7294e@dwillia2-xfh.jf.intel.com.notmuch ^^^^^ Ha, you're speaking what I'm thinking here. :-) > The scrub_core, like edac_core, has no method to detect scrubbing > facility, it is simply a passive library waiting for the first > scrub_device_register() call. Well, those scrub things still have methods which are better than nothing. EDAC is ancient. But ok, let's just say they're the same for the sake of simplicity. > Yeah, that's backwards. CXL enumeration belongs in the CXL driver and > the CXL driver is fully responsible for deciding when to incur the costs > of loading scrub_core. Ok, fair enough. > Assume that it does and memory_scrub_control_init() finds no scrub > facilities in any CXL devices and fails memory_scrub_control_init(). Any > module that links to scrub_device_register() will also fail to load > because module symbol resolution depends on all modules completing init. My angle was: scan the system for *all* possible scrub functionalities and if none present, then fail. And since they're only two... > Sure, but that's a driver-probe-time facility, not a module_init-time > facility. Oh well. > I assume you do not consider edac_core a mess? The whole EDAC is a mess but that's a whole another story. :-) > Now, the question of how many legacy scrub interfaces should be > considered in this design out of the gate is a worthwhile discussion. I > am encouraged that this ABI is at least trying to handle more than 1 > backend, which makes me feel better that adding a 3rd and 4th might not > be prohibitive. See above. I'm perfectly fine with: "hey, we have a new scrub API interfacing to RAS scrub capability and it is *the* thing to use and all other hw scrub functionality should be shoehorned into it. So this thing's design should at least try to anticipate supporting other scrub hw. Because there's EDAC too. Why isn't this scrub thing part of EDAC? Why isn't this scrub API part of edac_core? I mean, this is all RAS so why design a whole new thing when the required glue is already there? We can just as well have a /sys/devices/system/edac/scrub/ node hierarchy and have everything there. Why does it have to be yet another thing? And if it needs to be separate, who's going to maintain it? > Which matches what I reacted to on the last posting: > > "Maybe it is self evident to others, but for me there is little in these > changelogs besides 'mechanism exists, enable it'" > > ...and to me that feedback was taken to heart with much improved > changelogs in this new posting. Ok. > This init time feature probing discussion feels like it was born from a > micommunication / misunderstanding. Yes, it seems so, thanks for clarifying things. I still am unclear on the usecases and how this is supposed to be used and also, as mentioned above, we have a *lot* of RAS functionality spread around the kernel. Perhaps we should start unifying it instead of adding more... So the big picture and where we're headed to, needs to be clarified first. Thx.
Focusing on just one bit. > > Now, the question of how many legacy scrub interfaces should be > > considered in this design out of the gate is a worthwhile discussion. I > > am encouraged that this ABI is at least trying to handle more than 1 > > backend, which makes me feel better that adding a 3rd and 4th might not > > be prohibitive. > > See above. > > I'm perfectly fine with: "hey, we have a new scrub API interfacing to > RAS scrub capability and it is *the* thing to use and all other hw scrub > functionality should be shoehorned into it. > > So this thing's design should at least try to anticipate supporting > other scrub hw. > > Because there's EDAC too. Why isn't this scrub thing part of EDAC? Why > isn't this scrub API part of edac_core? I mean, this is all RAS so why > design a whole new thing when the required glue is already there? > > We can just as well have a > > /sys/devices/system/edac/scrub/ > > node hierarchy and have everything there. A few questions about this. It seems an unusual use fake devices and a bus so I'm trying to understand how we might do something that looks more standard but perhaps also fit within the existing scheme. I appreciate this stuff has evolved over a long time, so lots of backwards compatibility concerns. If I follow this right the current situation is: /sys/devices/system/edac is the 'virtual' device registered on the edac bus. > > Why does it have to be yet another thing? > > And if it needs to be separate, who's going to maintain it? > > > Which matches what I reacted to on the last posting: > > > > "Maybe it is self evident to others, but for me there is little in these > > changelogs besides 'mechanism exists, enable it'" > > > > ...and to me that feedback was taken to heart with much improved > > changelogs in this new posting. > > Ok. > > > This init time feature probing discussion feels like it was born from a > > micommunication / misunderstanding. > > Yes, it seems so, thanks for clarifying things. > > I still am unclear on the usecases and how this is supposed to be used > and also, as mentioned above, we have a *lot* of RAS functionality > spread around the kernel. Perhaps we should start unifying it instead of > adding more... > > So the big picture and where we're headed to, needs to be clarified first. > > Thx. >
On Fri, 17 May 2024 12:15:54 +0100 Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote: > Focusing on just one bit. > > > > Now, the question of how many legacy scrub interfaces should be > > > considered in this design out of the gate is a worthwhile discussion. I > > > am encouraged that this ABI is at least trying to handle more than 1 > > > backend, which makes me feel better that adding a 3rd and 4th might not > > > be prohibitive. > > > > See above. > > > > I'm perfectly fine with: "hey, we have a new scrub API interfacing to > > RAS scrub capability and it is *the* thing to use and all other hw scrub > > functionality should be shoehorned into it. > > > > So this thing's design should at least try to anticipate supporting > > other scrub hw. > > > > Because there's EDAC too. Why isn't this scrub thing part of EDAC? Why > > isn't this scrub API part of edac_core? I mean, this is all RAS so why > > design a whole new thing when the required glue is already there? > > > > We can just as well have a > > > > /sys/devices/system/edac/scrub/ > > > > node hierarchy and have everything there. Sorry - finger fumble, wasn't meant to send yet :( > > A few questions about this. It seems an unusual use of fake devices and a bus > so I'm trying to understand how we might do something that looks more standard > but perhaps also fit within the existing scheme. I appreciate this stuff > has evolved over a long time, so lots of backwards compatibility concerns. > > If I follow this right the current situation is: > > /sys/devices/system/edac is the 'virtual' device registered on the edac bus. Actually that's wrong it's not on the edac bus as that is the bus registered via subsys_system_register() (which does create a fake device as per the docs telling us not to use it any more - fair enough, legacy). The mc below it is a bare device - I think just to provide a directory? The comment on the release function seems to say that. This gives. /sys/devices/system/edac/mc /sys/bus/edac/devices/mc Under that we have individual mc0/mc1 etc for the instances of that accessible via /sys/devices/system/edac/mc/mc0 /sys/bus/edac/device/mc/mc0 Those are registered a children of mc. I'd have expected them to be children of the device that registered them - so for our case, a CXL mc0 node would be child of the CXL device rather than here but again I'm guessing legacy that had to be maintained. In general this nesting seems unusual, as I'd have expected the registration directly on the edac bus with /sys/bus/edac/device/mc0 /sys/bus/edac/device/pci0 Given we are talking about something new, maybe this is an opportunity to not perpetuate this? If we add scrub in here I'd prefer to just use the normal bus registration handling rather than creating a nest of additional nodes. So perhaps we could consider /sys/bus/edac/device/scrub0 (or whatever name makes sense, as per the earlier discussion of cxl_scrub0 or similar). Could consider moving the bus location of mc0 etc in future to there with symlinks to /sys/bus/edac/device/mc/* for backwards compatibility either via setting their parents or more explicit link creation. These scrub0 would have their dev->parent set to who ever actually registered them providing that reference cleanly and letting all the normal device model stuff work more simply. If we did that with the scrub nodes, the only substantial change from a separate subsystem as seen in this patch set would be to register them on the edac bus rather than a separate class. As you pointed out, there is a simple scrub interface in the existing edac memory controller code. How would you suggest handling that? Have them all register an additional device on the bus (as a child of the mcX devices) perhaps? Seems an easy step forwards and should be no backwards compatibility concerns. > > > > > Why does it have to be yet another thing? It absolutely doesn't as long as we can do it fairly cleanly within existing code. I wasn't sure that was possible, but you know edac a lot better than me and so I'll defer to you on that! > > > > And if it needs to be separate, who's going to maintain it? Several options for that, but fair question - bringing (at least some of) the RAS mess together will focus reviewer bandwidth etc better. > > > > > Which matches what I reacted to on the last posting: > > > > > > "Maybe it is self evident to others, but for me there is little in these > > > changelogs besides 'mechanism exists, enable it'" > > > > > > ...and to me that feedback was taken to heart with much improved > > > changelogs in this new posting. > > > > Ok. > > > > > This init time feature probing discussion feels like it was born from a > > > micommunication / misunderstanding. > > > > Yes, it seems so, thanks for clarifying things. > > > > I still am unclear on the usecases and how this is supposed to be used > > and also, as mentioned above, we have a *lot* of RAS functionality > > spread around the kernel. Perhaps we should start unifying it instead of > > adding more... I'm definitely keen on unifying things as I agree, this mixture of different RAS functionality is a ever worsening mess. Jonathan > > > > So the big picture and where we're headed to, needs to be clarified first. > > > > Thx. > > >
>-----Original Message----- >From: Borislav Petkov <bp@alien8.de> >Sent: 11 May 2024 11:17 >To: Dan Williams <dan.j.williams@intel.com> >Cc: Jonathan Cameron <jonathan.cameron@huawei.com>; Shiju Jose ><shiju.jose@huawei.com>; linux-cxl@vger.kernel.org; linux- >acpi@vger.kernel.org; linux-mm@kvack.org; dave@stgolabs.net; >dave.jiang@intel.com; alison.schofield@intel.com; vishal.l.verma@intel.com; >ira.weiny@intel.com; linux-edac@vger.kernel.org; linux- >kernel@vger.kernel.org; david@redhat.com; Vilas.Sridharan@amd.com; >leo.duran@amd.com; Yazen.Ghannam@amd.com; rientjes@google.com; >jiaqiyan@google.com; tony.luck@intel.com; Jon.Grimm@amd.com; >dave.hansen@linux.intel.com; rafael@kernel.org; lenb@kernel.org; >naoya.horiguchi@nec.com; james.morse@arm.com; jthoughton@google.com; >somasundaram.a@hpe.com; erdemaktas@google.com; pgonda@google.com; >duenwen@google.com; mike.malvestuto@intel.com; gthelen@google.com; >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; >kangkang.shen@futurewei.com; wanghuiqiang <wanghuiqiang@huawei.com>; >Linuxarm <linuxarm@huawei.com>; Greg Kroah-Hartman ><gregkh@linuxfoundation.org>; Jean Delvare <jdelvare@suse.com>; Guenter >Roeck <linux@roeck-us.net>; Dmitry Torokhov <dmitry.torokhov@gmail.com> >Subject: Re: [RFC PATCH v8 01/10] ras: scrub: Add scrub subsystem > >On Fri, May 10, 2024 at 10:13:41AM -0700, Dan Williams wrote: >> In fact this question matches my reaction to the last posting [1], and >> led to a much improved cover letter and the "Comparison of scrubbing >> features". To your point there are scrub capabilities already in the >> kernel and we would need to make a decision about what to do about them. > >The answer to that question is whether this new userspace usage is going to >want to control those too. > >So > >"Use case of scrub control feature" > >from the cover letter is giving two short sentences about what one would do but >I'm still meh. A whole subsystem needing a bunch of effort would need a lot >more justification. > >So can anyone please elaborate more on the use cases and why this new thing is >needed? Following are some of the use cases of generic scrub control subsystem as given in the cover letter. Request please add any other use cases, which I missed. 1. There are several types of interfaces to HW memory scrubbers identified such as ACPI NVDIMM ARS(Address Range Scrub), CXL memory device patrol scrub, CXL DDR5 ECS, ACPI RAS2 memory scrubbing features and software based memory scrubber(discussed in the community Reference [5] in the cover letter). Also some scrubbers support controlling (background) patrol scrubbing(ACPI RAS2, CXL) and/or on-demand scrubbing(ACPI RAS2, ACPI ARS). However the scrub controls varies between memory scrubbers. Thus there is a need for a standard generic ABI and sysfs scrub controls for the userspace tools, which control HW and SW scrubbers in the system, for the easiness of use. 2. Scrub controls in user space allow the user space tool to disable and enable the feature in case disabling of the background patrol scrubbing and changing the scrub rate are needed for other purposes such as performance-aware operations which requires the background operations to be turned off or reduced. 3. Allows to perform on-demand scrubbing for specific address range if supported by the scrubber. 4. User space tools controls scrub the memory DIMMs regularly at a configurable scrub rate using the sysfs scrub controls discussed help, - to detect uncorrectable memory errors early before user accessing memory, which helps to recover the detected memory errors. - reduces the chance of a correctable error becoming uncorrectable. Regards, Shiju
On Mon, 20 May 2024 11:54:50 +0100 Shiju Jose <shiju.jose@huawei.com> wrote: > >-----Original Message----- > >From: Borislav Petkov <bp@alien8.de> > >Sent: 11 May 2024 11:17 > >To: Dan Williams <dan.j.williams@intel.com> > >Cc: Jonathan Cameron <jonathan.cameron@huawei.com>; Shiju Jose > ><shiju.jose@huawei.com>; linux-cxl@vger.kernel.org; linux- > >acpi@vger.kernel.org; linux-mm@kvack.org; dave@stgolabs.net; > >dave.jiang@intel.com; alison.schofield@intel.com; vishal.l.verma@intel.com; > >ira.weiny@intel.com; linux-edac@vger.kernel.org; linux- > >kernel@vger.kernel.org; david@redhat.com; Vilas.Sridharan@amd.com; > >leo.duran@amd.com; Yazen.Ghannam@amd.com; rientjes@google.com; > >jiaqiyan@google.com; tony.luck@intel.com; Jon.Grimm@amd.com; > >dave.hansen@linux.intel.com; rafael@kernel.org; lenb@kernel.org; > >naoya.horiguchi@nec.com; james.morse@arm.com; jthoughton@google.com; > >somasundaram.a@hpe.com; erdemaktas@google.com; pgonda@google.com; > >duenwen@google.com; mike.malvestuto@intel.com; gthelen@google.com; > >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; > >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei > ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; > >kangkang.shen@futurewei.com; wanghuiqiang <wanghuiqiang@huawei.com>; > >Linuxarm <linuxarm@huawei.com>; Greg Kroah-Hartman > ><gregkh@linuxfoundation.org>; Jean Delvare <jdelvare@suse.com>; Guenter > >Roeck <linux@roeck-us.net>; Dmitry Torokhov <dmitry.torokhov@gmail.com> > >Subject: Re: [RFC PATCH v8 01/10] ras: scrub: Add scrub subsystem > > > >On Fri, May 10, 2024 at 10:13:41AM -0700, Dan Williams wrote: > >> In fact this question matches my reaction to the last posting [1], and > >> led to a much improved cover letter and the "Comparison of scrubbing > >> features". To your point there are scrub capabilities already in the > >> kernel and we would need to make a decision about what to do about them. > > > >The answer to that question is whether this new userspace usage is going to > >want to control those too. > > > >So > > > >"Use case of scrub control feature" > > > >from the cover letter is giving two short sentences about what one would do but > >I'm still meh. A whole subsystem needing a bunch of effort would need a lot > >more justification. > > > >So can anyone please elaborate more on the use cases and why this new thing is > >needed? > > Following are some of the use cases of generic scrub control subsystem as given in the cover letter. > Request please add any other use cases, which I missed. > > 1. There are several types of interfaces to HW memory scrubbers identified such as ACPI NVDIMM ARS(Address Range Scrub), CXL memory device patrol scrub, CXL DDR5 ECS, ACPI RAS2 memory scrubbing features and software based memory scrubber(discussed in the community Reference [5] in the cover letter). Also some scrubbers support controlling (background) patrol scrubbing(ACPI RAS2, CXL) and/or on-demand scrubbing(ACPI RAS2, ACPI ARS). However the scrub controls varies between memory scrubbers. Thus there is a need for a standard generic ABI and sysfs scrub controls for the userspace tools, which control HW and SW scrubbers in the system, for the easiness of use. > 2. Scrub controls in user space allow the user space tool to disable and enable the feature in case disabling of the background patrol scrubbing and changing the scrub rate are needed for other purposes such as performance-aware operations which requires the background operations to be turned off or reduced. > 3. Allows to perform on-demand scrubbing for specific address range if supported by the scrubber. > 4. User space tools controls scrub the memory DIMMs regularly at a configurable scrub rate using the sysfs scrub controls discussed help, > - to detect uncorrectable memory errors early before user accessing memory, which helps to recover the detected memory errors. > - reduces the chance of a correctable error becoming uncorrectable. Just to add one more reason a user space interface is needed. 5. Policy control for hotplugged memory. There is not necessarily a system wide bios or similar in the loop to control the scrub settings on a CXL device that wasn't there at boot. What that setting should be is a policy decision as we are trading of reliability vs performance - hence it should be in control of userspace. As such, 'an' interface is needed. Seems more sensible to try and unify it with other similar interfaces than spin yet another one. > > Regards, > Shiju >
On Fri, May 17, 2024 at 12:44:18PM +0100, Jonathan Cameron wrote: > Given we are talking about something new, maybe this is an opportunity > to not perpetuate this? > > If we add scrub in here I'd prefer to just use the normal bus registration > handling rather than creating a nest of additional nodes. So perhaps we > could consider > /sys/bus/edac/device/scrub0 (or whatever name makes sense, as per the > earlier discussion of cxl_scrub0 or similar). Yes, my main worry is how this RAS functionality is going to be all organized in the tree. Yes, EDAC legacy methods can die but the user-visible part can't so we might as well use it to concentrate stuff there. > Could consider moving the bus location of mc0 etc in future to there with > symlinks to /sys/bus/edac/device/mc/* for backwards compatibility either > via setting their parents or more explicit link creation. You can ignore the mc - that's the memory controller representation EDAC does and that's also kind of semi-legacy considering how heterogeneous devices are becoming. Nowadays, scrubbing functionality can be on anything that has memory and that's not only a memory controller. So it would actually be the better thing to abstract that differently and use .../edac/device/ for the different RAS functionalities. I.e., have the "device" organize it all. > These scrub0 would have their dev->parent set to who ever actually > registered them providing that reference cleanly and letting all the > normal device model stuff work more simply. Ack. > If we did that with the scrub nodes, the only substantial change from > a separate subsystem as seen in this patch set would be to register > them on the edac bus rather than a separate class. > > As you pointed out, there is a simple scrub interface in the existing > edac memory controller code. How would you suggest handling that? > Have them all register an additional device on the bus (as a child > of the mcX devices) perhaps? Seems an easy step forwards and should > be no backwards compatibility concerns. Well, you guys want to control that scrubbing from userspace and those old things probably do not fit that model? We could just not convert them for now and add them later if really needed. I.e., leave sleeping dogs lie. > It absolutely doesn't as long as we can do it fairly cleanly within > existing code. I wasn't sure that was possible, but you know edac > a lot better than me and so I'll defer to you on that! Meh, I'm simply maintaining it because no one else wants to. :) > Several options for that, but fair question - bringing (at least some of) > the RAS mess together will focus reviewer bandwidth etc better. Review is more than appreciated, as always. > I'm definitely keen on unifying things as I agree, this mixture of different > RAS functionality is a ever worsening mess. Yap, it needs to be unified and reigned into something more user-friendly and manageable. Thx.
On Tue, 21 May 2024 10:06:21 +0200 Borislav Petkov <bp@alien8.de> wrote: > On Fri, May 17, 2024 at 12:44:18PM +0100, Jonathan Cameron wrote: > > Given we are talking about something new, maybe this is an opportunity > > to not perpetuate this? > > > > If we add scrub in here I'd prefer to just use the normal bus registration > > handling rather than creating a nest of additional nodes. So perhaps we > > could consider > > /sys/bus/edac/device/scrub0 (or whatever name makes sense, as per the > > earlier discussion of cxl_scrub0 or similar). > > Yes, my main worry is how this RAS functionality is going to be all > organized in the tree. Yes, EDAC legacy methods can die but the > user-visible part can't so we might as well use it to concentrate stuff > there. Understood. > > > Could consider moving the bus location of mc0 etc in future to there with > > symlinks to /sys/bus/edac/device/mc/* for backwards compatibility either > > via setting their parents or more explicit link creation. > > You can ignore the mc - that's the memory controller representation EDAC > does and that's also kind of semi-legacy considering how heterogeneous > devices are becoming. Nowadays, scrubbing functionality can be on > anything that has memory and that's not only a memory controller. > > So it would actually be the better thing to abstract that differently > and use .../edac/device/ for the different RAS functionalities. I.e., > have the "device" organize it all. I'm not sure I follow this. Definitely worth ensuring we are thinking the same thing wrt to layout before we go further, Do you mean keep it similar to the existing device/mc device/pci structure so /sys/bus/edac/devices/scrub/cxl_mem0_scrub etc? This would rely on symlinks to paper over the dev->parent not being the normal parent. Hence would be similar to /sys/bus/edac/devices/pci in edac_pci_create_sysfs() or equivalent in edac_device_create_sysfs(). Or is the ../edac/device bit about putting an extra device under edac/devices/? e.g. /sys/bus/edac/devices/cxl_memX/scrub /sys/bus/edac/devices/cxl_memX/other_ras_thing which would be fairly standard driver model stuff. This would sit alongside 'legacy' /sys/bus/edac/devices/mc/mcX /sys/bus/edac/devices/pci/pciX etc I'd prefer this second model as it's very standard and but grouping is per providing parent device, rather than functionality. However, it is rather different from the existing edac structure. Where I've used the symlink approach in the past, it has always been about keeping a legacy interface in place, not where I'd start with something new. Hence I think this is a question of how far we 'breakaway' from existing edac structure. > > > These scrub0 would have their dev->parent set to who ever actually > > registered them providing that reference cleanly and letting all the > > normal device model stuff work more simply. > > Ack. This suggests the second option above, but I wanted to confirm as Shiju and I read this differently. > > > If we did that with the scrub nodes, the only substantial change from > > a separate subsystem as seen in this patch set would be to register > > them on the edac bus rather than a separate class. > > > > As you pointed out, there is a simple scrub interface in the existing > > edac memory controller code. How would you suggest handling that? > > Have them all register an additional device on the bus (as a child > > of the mcX devices) perhaps? Seems an easy step forwards and should > > be no backwards compatibility concerns. > > Well, you guys want to control that scrubbing from userspace and those > old things probably do not fit that model? We could just not convert > them for now and add them later if really needed. I.e., leave sleeping > dogs lie. Ok. There is an existing is the minimal sysfs existing interface but I'm fine with ignoring it for now. > > > It absolutely doesn't as long as we can do it fairly cleanly within > > existing code. I wasn't sure that was possible, but you know edac > > a lot better than me and so I'll defer to you on that! > > Meh, I'm simply maintaining it because no one else wants to. :) *much sympathy!* As we ramp up more on this stuff, we'll try and help out where we can. > > > Several options for that, but fair question - bringing (at least some of) > > the RAS mess together will focus reviewer bandwidth etc better. > > Review is more than appreciated, as always. > > > I'm definitely keen on unifying things as I agree, this mixture of different > > RAS functionality is a ever worsening mess. > > Yap, it needs to be unified and reigned into something more > user-friendly and manageable. Hopefully we all agree on a unified solution being the target. Feels like we are converging. Now we are down to the details :) Thanks, Jonathan > > Thx. >
On Wed, May 22, 2024 at 10:40:17AM +0100, Jonathan Cameron wrote: > Where I've used the symlink approach in the past, it has always > been about keeping a legacy interface in place, not where I'd start > with something new. Hence I think this is a question of how far > we 'breakaway' from existing edac structure. Yes, since we're designing this anew, we can do whatever we prefer. So let's do the standard driver model stuff. However, there's also /sys/devices/system/edac/ so I don't know what the right thing to do there is. Are we supposed to put everything under /sys/bus/edac now and /sys/devices/system/edac is wrong now and should go away? Maybe we should talk to Greg first ... :) > This suggests the second option above, but I wanted to confirm as Shiju > and I read this differently. As said, we wanna do the new correct way of how a sysfs interface should look and leave the old one as it is. > Ok. There is an existing is the minimal sysfs existing interface but I'm > fine with ignoring it for now. Yes, we don't know who's even using it and why. That's why I'm very interested in how this new thing is going to be used so that we know what we're committing to supporting forever. > *much sympathy!* As we ramp up more on this stuff, we'll try and > help out where we can. Always appreciated! :-) > Hopefully we all agree on a unified solution being the target. > > Feels like we are converging. Now we are down to the details :) Yap. Thanks!
On Mon, May 20, 2024 at 12:58:57PM +0100, Jonathan Cameron wrote: > > Following are some of the use cases of generic scrub control > > subsystem as given in the cover letter. Request please add any > > other use cases, which I missed. > > > > 1. There are several types of interfaces to HW memory scrubbers > > identified such as ACPI NVDIMM ARS(Address Range Scrub), CXL > > memory device patrol scrub, CXL DDR5 ECS, ACPI RAS2 memory > > scrubbing features and software based memory scrubber(discussed > > in the community Reference [5] in the cover letter). Also some > > scrubbers support controlling (background) patrol scrubbing(ACPI > > RAS2, CXL) and/or on-demand scrubbing(ACPI RAS2, ACPI ARS). > > However the scrub controls varies between memory scrubbers. Thus > > there is a need for a standard generic ABI and sysfs scrub > > controls for the userspace tools, which control HW and SW > > scrubbers in the system, for the easiness of use. This is all talking about what hw functionality there is. I'm more interested in the "there is a need" thing. What need? How? In order to support something like this upstream, I'd like to know how it is going to be used and whether the major use cases are covered. So that everyone can benefit from it - not only your employer. > > 2. Scrub controls in user space allow the user space tool to disable > > and enable the feature in case disabling of the background patrol > > scrubbing and changing the scrub rate are needed for other > > purposes such as performance-aware operations which requires the > > background operations to be turned off or reduced. Who's going to use those scrub controls? Tools? Admins? Scripts? > > 3. Allows to perform on-demand scrubbing for specific address range > > if supported by the scrubber. > > 4. User space tools controls scrub the memory DIMMs regularly at > > a configurable scrub rate using the sysfs scrub controls > > discussed help, - to detect uncorrectable memory errors early > > before user accessing memory, which helps to recover the detected > > memory errors. - reduces the chance of a correctable error > > becoming uncorrectable. Yah, that's not my question: my question is, how is this new thing, which is exposed to userspace and which then means, this will be supported forever, how is this thing going to be used? And the next question is: is that interface sufficient for those use cases? Are we covering the majority of the usage scenarios? > Just to add one more reason a user space interface is needed. > 5. Policy control for hotplugged memory. There is not necessarily > a system wide bios or similar in the loop to control the scrub > settings on a CXL device that wasn't there at boot. What that > setting should be is a policy decision as we are trading of > reliability vs performance - hence it should be in control of > userspace. > As such, 'an' interface is needed. Seems more sensible to try and > unify it with other similar interfaces than spin yet another one. Yes, I get that: question is, let's say you have that interface. Now what do you do? Do you go and start a scrub cycle by hand? Do you have a script which does that based on some system reports? Do you automate it? I wanna say yes because that's miles better than having to explain yet another set of knobs to users. And so on and so on... I'm trying to get you to imagine the *full* solution and then ask yourselves whether that new interface is adequate. Makes more sense? Thx.
On Mon, 27 May 2024 11:21:31 +0200 Borislav Petkov <bp@alien8.de> wrote: > On Mon, May 20, 2024 at 12:58:57PM +0100, Jonathan Cameron wrote: > > > Following are some of the use cases of generic scrub control > > > subsystem as given in the cover letter. Request please add any > > > other use cases, which I missed. > > > > > > 1. There are several types of interfaces to HW memory scrubbers > > > identified such as ACPI NVDIMM ARS(Address Range Scrub), CXL > > > memory device patrol scrub, CXL DDR5 ECS, ACPI RAS2 memory > > > scrubbing features and software based memory scrubber(discussed > > > in the community Reference [5] in the cover letter). Also some > > > scrubbers support controlling (background) patrol scrubbing(ACPI > > > RAS2, CXL) and/or on-demand scrubbing(ACPI RAS2, ACPI ARS). > > > However the scrub controls varies between memory scrubbers. Thus > > > there is a need for a standard generic ABI and sysfs scrub > > > controls for the userspace tools, which control HW and SW > > > scrubbers in the system, for the easiness of use. > > This is all talking about what hw functionality there is. I'm more > interested in the "there is a need" thing. What need? How? > > In order to support something like this upstream, I'd like to know how > it is going to be used and whether the major use cases are covered. So > that everyone can benefit from it - not only your employer. Fair questions. > > > > 2. Scrub controls in user space allow the user space tool to disable > > > and enable the feature in case disabling of the background patrol > > > scrubbing and changing the scrub rate are needed for other > > > purposes such as performance-aware operations which requires the > > > background operations to be turned off or reduced. > > Who's going to use those scrub controls? Tools? Admins? Scripts? If dealing with disabling, I'd be surprised if it was a normal policy but if it were udev script or boot script. If unusual event (i.e. someone is trying to reduce jitter in a benchmark targetting something else) then interface is simple enough that an admin can poke it directly. > > > > 3. Allows to perform on-demand scrubbing for specific address range > > > if supported by the scrubber. > > > 4. User space tools controls scrub the memory DIMMs regularly at > > > a configurable scrub rate using the sysfs scrub controls > > > discussed help, - to detect uncorrectable memory errors early > > > before user accessing memory, which helps to recover the detected > > > memory errors. - reduces the chance of a correctable error > > > becoming uncorrectable. > > Yah, that's not my question: my question is, how is this new thing, > which is exposed to userspace and which then means, this will be > supported forever, how is this thing going to be used? > > And the next question is: is that interface sufficient for those use > cases? > > Are we covering the majority of the usage scenarios? To a certain extent this is bounded by what the hardware lets us do but agreed we should make sure it 'works' for the usecases we know about. Starting point is some more documentation in the patch set giving common flows (and maybe some example scripts). > > > Just to add one more reason a user space interface is needed. > > 5. Policy control for hotplugged memory. There is not necessarily > > a system wide bios or similar in the loop to control the scrub > > settings on a CXL device that wasn't there at boot. What that > > setting should be is a policy decision as we are trading of > > reliability vs performance - hence it should be in control of > > userspace. > > As such, 'an' interface is needed. Seems more sensible to try and > > unify it with other similar interfaces than spin yet another one. > > Yes, I get that: question is, let's say you have that interface. Now > what do you do? > > Do you go and start a scrub cycle by hand? Typically no, but the option would be there to support an admin who is suspicious or who is trying to gather statistics or similar. > > Do you have a script which does that based on some system reports? > That definitely makes sense for NVDIMM scrub as the model there is to only ever do it on a demand as a single scrub pass. For a cyclic scrub we can spin a policy in rasdaemon or similar to possibly crank up the frequency if we are getting lots of 'non scrub' faults (i.e. correct error reported on demand accesses). Shiju is our expert on this sort of userspace stats monitoring and handling so I'll leave him to come back with a proposal / PoC for doing that. I can see two motivations though: a) Gather better stats on suspect device by ensuring more correctable error detections. b) Increase scrubbing on a device which is on it's way out but not replacable yet for some reason. I would suggest this will be PoC level only for now as it will need a lot of testing on large fleets to do anything sophisticated. > Do you automate it? I wanna say yes because that's miles better than > having to explain yet another set of knobs to users. First instance, I'd expect an UDEV policy so when a new CXL memory turns up we set a default value. A cautious admin would have tweaked that script to set the default to scrub more often, an admin who knows they don't care might turn it off. We can include an example of that in next version I think. > > And so on and so on... > > I'm trying to get you to imagine the *full* solution and then ask > yourselves whether that new interface is adequate. > > Makes more sense? > Absolutely. One area that needs to improve (Dan raised it) is association with HPA ranges so we at can correlate easily error reports with which scrub engine. That can be done with existing version but it's fiddlier than it needs to be. This 'might' be a userspace script example, or maybe making associations tighter in kernel. Jonathan > Thx. >
On Tue, May 28, 2024 at 10:06:45AM +0100, Jonathan Cameron wrote: > If dealing with disabling, I'd be surprised if it was a normal policy but > if it were udev script or boot script. If unusual event (i.e. someone is Yeah, I wouldn't disable it during boot but around my workload only. You want for automatic scrubs to still happen on the system. > trying to reduce jitter in a benchmark targetting something else) then > interface is simple enough that an admin can poke it directly. Right, for benchmarks direct poking is fine. When it is supposed to be something more involved like, dunno, HPC doing a heavy workload and it wants to squeeze all performance so I guess turning off the scrubbers would be part of the setup script. So yeah, if this is properly documented, scripting around it is easy. > To a certain extent this is bounded by what the hardware lets us > do but agreed we should make sure it 'works' for the usecases we know > about. Starting point is some more documentation in the patch set > giving common flows (and maybe some example scripts). Yap, sounds good. As in: "These are the envisioned usages at the time of writing... " or so. > > Do you go and start a scrub cycle by hand? > > Typically no, but the option would be there to support an admin who is > suspicious or who is trying to gather statistics or similar. Ok. > That definitely makes sense for NVDIMM scrub as the model there is > to only ever do it on a demand as a single scrub pass. > For a cyclic scrub we can spin a policy in rasdaemon or similar to > possibly crank up the frequency if we are getting lots of 'non scrub' > faults (i.e. correct error reported on demand accesses). I was going to suggest that: automating stuff with rasdaemon. It would definitely simplify talking to that API. > Shiju is our expert on this sort of userspace stats monitoring and > handling so I'll leave him to come back with a proposal / PoC for doing that. > > I can see two motivations though: > a) Gather better stats on suspect device by ensuring more correctable > error detections. > b) Increase scrubbing on a device which is on it's way out but not replacable > yet for some reason. > > I would suggest this will be PoC level only for now as it will need > a lot of testing on large fleets to do anything sophisticated. Yeah, sounds like a good start. > > Do you automate it? I wanna say yes because that's miles better than > > having to explain yet another set of knobs to users. > > First instance, I'd expect an UDEV policy so when a new CXL memory > turns up we set a default value. A cautious admin would have tweaked > that script to set the default to scrub more often, an admin who > knows they don't care might turn it off. We can include an example of that > in next version I think. Yes, and then hook into rasdaemon the moment it logs an error in some component to go and increase scrubbing of that component. But yeah, you said that above already. > Absolutely. One area that needs to improve (Dan raised it) is > association with HPA ranges so we at can correlate easily error reports > with which scrub engine. That can be done with existing version but > it's fiddlier than it needs to be. This 'might' be a userspace script > example, or maybe making associations tighter in kernel. Right. Thx.
diff --git a/Documentation/ABI/testing/sysfs-class-scrub-configure b/Documentation/ABI/testing/sysfs-class-scrub-configure new file mode 100644 index 000000000000..3ed77dbb00ad --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-scrub-configure @@ -0,0 +1,47 @@ +What: /sys/class/ras/ +Date: March 2024 +KernelVersion: 6.9 +Contact: linux-kernel@vger.kernel.org +Description: + The ras/ class subdirectory belongs to the + common ras features such as scrub subsystem. + +What: /sys/class/ras/rasX/scrub/ +Date: March 2024 +KernelVersion: 6.9 +Contact: linux-kernel@vger.kernel.org +Description: + The /sys/class/ras/ras{0,1,2,3,...}/scrub directories + correspond to each scrub device registered with the + scrub subsystem. + +What: /sys/class/ras/rasX/scrub/name +Date: March 2024 +KernelVersion: 6.9 +Contact: linux-kernel@vger.kernel.org +Description: + (RO) name of the memory scrubber + +What: /sys/class/ras/rasX/scrub/enable_background +Date: March 2024 +KernelVersion: 6.9 +Contact: linux-kernel@vger.kernel.org +Description: + (RW) Enable/Disable background(patrol) scrubbing if supported. + +What: /sys/class/ras/rasX/scrub/rate_available +Date: March 2024 +KernelVersion: 6.9 +Contact: linux-kernel@vger.kernel.org +Description: + (RO) Supported range for the scrub rate by the scrubber. + The scrub rate represents in hours. + +What: /sys/class/ras/rasX/scrub/rate +Date: March 2024 +KernelVersion: 6.9 +Contact: linux-kernel@vger.kernel.org +Description: + (RW) The scrub rate specified and it must be with in the + supported range by the scrubber. + The scrub rate represents in hours. diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index fc4f4bb94a4c..181701479564 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig @@ -46,4 +46,11 @@ config RAS_FMPM Memory will be retired during boot time and run time depending on platform-specific policies. +config SCRUB + tristate "Memory scrub driver" + help + This option selects the memory scrub subsystem, supports + configuring the parameters of underlying scrubbers in the + system for the DRAM memories. + endif diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index 11f95d59d397..89bcf0d84355 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_RAS) += ras.o obj-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_RAS_CEC) += cec.o +obj-$(CONFIG_SCRUB) += memory_scrub.o obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o obj-y += amd/atl/ diff --git a/drivers/ras/memory_scrub.c b/drivers/ras/memory_scrub.c new file mode 100755 index 000000000000..7e995380ec3a --- /dev/null +++ b/drivers/ras/memory_scrub.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory scrub subsystem supports configuring the registered + * memory scrubbers. + * + * Copyright (c) 2024 HiSilicon Limited. + */ + +#define pr_fmt(fmt) "MEM SCRUB: " fmt + +#include <linux/acpi.h> +#include <linux/bitops.h> +#include <linux/delay.h> +#include <linux/kfifo.h> +#include <linux/memory_scrub.h> +#include <linux/platform_device.h> +#include <linux/spinlock.h> + +/* memory scrubber config definitions */ +#define SCRUB_ID_PREFIX "ras" +#define SCRUB_ID_FORMAT SCRUB_ID_PREFIX "%d" + +static DEFINE_IDA(scrub_ida); + +struct scrub_device { + int id; + struct device dev; + const struct scrub_ops *ops; +}; + +#define to_scrub_device(d) container_of(d, struct scrub_device, dev) +static ssize_t enable_background_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct scrub_device *scrub_dev = to_scrub_device(dev); + bool enable; + int ret; + + ret = kstrtobool(buf, &enable); + if (ret < 0) + return ret; + + ret = scrub_dev->ops->set_enabled_bg(dev, enable); + if (ret) + return ret; + + return len; +} + +static ssize_t enable_background_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct scrub_device *scrub_dev = to_scrub_device(dev); + bool enable; + int ret; + + ret = scrub_dev->ops->get_enabled_bg(dev, &enable); + if (ret) + return ret; + + return sysfs_emit(buf, "%d\n", enable); +} + +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct scrub_device *scrub_dev = to_scrub_device(dev); + int ret; + + ret = scrub_dev->ops->get_name(dev, buf); + if (ret) + return ret; + + return strlen(buf); +} + +static ssize_t rate_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct scrub_device *scrub_dev = to_scrub_device(dev); + u64 val; + int ret; + + ret = scrub_dev->ops->rate_read(dev, &val); + if (ret) + return ret; + + return sysfs_emit(buf, "0x%llx\n", val); +} + +static ssize_t rate_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct scrub_device *scrub_dev = to_scrub_device(dev); + long val; + int ret; + + ret = kstrtol(buf, 10, &val); + if (ret < 0) + return ret; + + ret = scrub_dev->ops->rate_write(dev, val); + if (ret) + return ret; + + return len; +} + +static ssize_t rate_available_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct scrub_device *scrub_dev = to_scrub_device(dev); + u64 min_sr, max_sr; + int ret; + + ret = scrub_dev->ops->rate_avail_range(dev, &min_sr, &max_sr); + if (ret) + return ret; + + return sysfs_emit(buf, "0x%llx-0x%llx\n", min_sr, max_sr); +} + +DEVICE_ATTR_RW(enable_background); +DEVICE_ATTR_RO(name); +DEVICE_ATTR_RW(rate); +DEVICE_ATTR_RO(rate_available); + +static struct attribute *scrub_attrs[] = { + &dev_attr_enable_background.attr, + &dev_attr_name.attr, + &dev_attr_rate.attr, + &dev_attr_rate_available.attr, + NULL +}; + +static umode_t scrub_attr_visible(struct kobject *kobj, + struct attribute *a, int attr_id) +{ + struct device *dev = kobj_to_dev(kobj); + struct scrub_device *scrub_dev = to_scrub_device(dev); + const struct scrub_ops *ops = scrub_dev->ops; + + if (a == &dev_attr_enable_background.attr) { + if (ops->set_enabled_bg && ops->get_enabled_bg) + return a->mode; + if (ops->get_enabled_bg) + return 0444; + return 0; + } + if (a == &dev_attr_name.attr) + return ops->get_name ? a->mode : 0; + if (a == &dev_attr_rate_available.attr) + return ops->rate_avail_range ? a->mode : 0; + if (a == &dev_attr_rate.attr) { /* Write only makes little sense */ + if (ops->rate_read && ops->rate_write) + return a->mode; + if (ops->rate_read) + return 0444; + return 0; + } + + return 0; +} + +static const struct attribute_group scrub_attr_group = { + .name = "scrub", + .attrs = scrub_attrs, + .is_visible = scrub_attr_visible, +}; + +static const struct attribute_group *scrub_attr_groups[] = { + &scrub_attr_group, + NULL +}; + +static void scrub_dev_release(struct device *dev) +{ + struct scrub_device *scrub_dev = to_scrub_device(dev); + + ida_free(&scrub_ida, scrub_dev->id); + kfree(scrub_dev); +} + +static struct class scrub_class = { + .name = "ras", + .dev_groups = scrub_attr_groups, + .dev_release = scrub_dev_release, +}; + +static struct device * +scrub_device_register(struct device *parent, void *drvdata, + const struct scrub_ops *ops) +{ + struct scrub_device *scrub_dev; + struct device *hdev; + int err; + + scrub_dev = kzalloc(sizeof(*scrub_dev), GFP_KERNEL); + if (!scrub_dev) + return ERR_PTR(-ENOMEM); + hdev = &scrub_dev->dev; + + scrub_dev->id = ida_alloc(&scrub_ida, GFP_KERNEL); + if (scrub_dev->id < 0) { + kfree(scrub_dev); + return ERR_PTR(-ENOMEM); + } + + scrub_dev->ops = ops; + hdev->class = &scrub_class; + hdev->parent = parent; + dev_set_drvdata(hdev, drvdata); + dev_set_name(hdev, SCRUB_ID_FORMAT, scrub_dev->id); + err = device_register(hdev); + if (err) { + put_device(hdev); + return ERR_PTR(err); + } + + return hdev; +} + +static void devm_scrub_release(void *dev) +{ + device_unregister(dev); +} + +/** + * devm_scrub_device_register - register scrubber device + * @dev: the parent device + * @drvdata: driver data to attach to the scrub device + * @ops: pointer to scrub_ops structure (optional) + * + * Returns the pointer to the new device on success, ERR_PTR() otherwise. + * The new device would be automatically unregistered with the parent device. + */ +struct device * +devm_scrub_device_register(struct device *dev, void *drvdata, + const struct scrub_ops *ops) +{ + struct device *hdev; + int ret; + + if (!dev) + return ERR_PTR(-EINVAL); + + hdev = scrub_device_register(dev, drvdata, ops); + if (IS_ERR(hdev)) + return hdev; + + ret = devm_add_action_or_reset(dev, devm_scrub_release, hdev); + if (ret) + return ERR_PTR(ret); + + return hdev; +} +EXPORT_SYMBOL_GPL(devm_scrub_device_register); + +static int __init memory_scrub_control_init(void) +{ + return class_register(&scrub_class); +} +subsys_initcall(memory_scrub_control_init); + +static void memory_scrub_control_exit(void) +{ + class_unregister(&scrub_class); +} +module_exit(memory_scrub_control_exit); diff --git a/include/linux/memory_scrub.h b/include/linux/memory_scrub.h new file mode 100755 index 000000000000..f0e1657a5072 --- /dev/null +++ b/include/linux/memory_scrub.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Memory scrub subsystem driver supports controlling + * the memory scrubbers in the system. + * + * Copyright (c) 2024 HiSilicon Limited. + */ + +#ifndef __MEMORY_SCRUB_H +#define __MEMORY_SCRUB_H + +#include <linux/types.h> + +struct device; + +/** + * struct scrub_ops - scrub device operations (all elements optional) + * @get_enabled_bg: check if currently performing background scrub. + * @set_enabled_bg: start or stop a bg-scrub. + * @get_name: get the memory scrubber name. + * @rate_avail_range: retrieve limits on supported rates. + * @rate_read: read the scrub rate + * @rate_write: set the scrub rate + */ +struct scrub_ops { + int (*get_enabled_bg)(struct device *dev, bool *enable); + int (*set_enabled_bg)(struct device *dev, bool enable); + int (*get_name)(struct device *dev, char *buf); + int (*rate_avail_range)(struct device *dev, u64 *min, u64 *max); + int (*rate_read)(struct device *dev, u64 *rate); + int (*rate_write)(struct device *dev, u64 rate); +}; + +struct device * +devm_scrub_device_register(struct device *dev, void *drvdata, + const struct scrub_ops *ops); +#endif /* __MEMORY_SCRUB_H */