diff mbox series

[v4,2/2] soc: amazon: al-pos-edac: Introduce Amazon's Annapurna Labs POS EDAC driver

Message ID 1570102361-11696-3-git-send-email-talel@amazon.com (mailing list archive)
State New, archived
Headers show
Series Amazon's Annapurna Labs POS Driver | expand

Commit Message

Shenhar, Talel Oct. 3, 2019, 11:32 a.m. UTC
The Amazon Annapurna Labs SoCs include a Point Of Serialization (POS) error
logging unit that reports an error in case of a write error (e.g., an attempt
to write to a read-only register).
This error shall be reported to the EDAC subsystem as an uncorrectable error.

Signed-off-by: Talel Shenhar <talel@amazon.com>
---
 MAINTAINERS                |   7 ++
 drivers/edac/Kconfig       |   6 ++
 drivers/edac/Makefile      |   1 +
 drivers/edac/al_pos_edac.c | 173 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 187 insertions(+)
 create mode 100644 drivers/edac/al_pos_edac.c

Comments

Marc Zyngier Oct. 7, 2019, 11:26 a.m. UTC | #1
On Thu, 03 Oct 2019 12:32:41 +0100,
Talel Shenhar <talel@amazon.com> wrote:
> 
> The Amazon's Annapurna Labs SoCs includes Point Of Serialization error
> logging unit that reports an error in case write error (e.g . Attempt to
> write to a read only register).
> This error shall be reported to EDAC subsystem as uncorrectable-error.
> 
> Signed-off-by: Talel Shenhar <talel@amazon.com>
> ---
>  MAINTAINERS                |   7 ++
>  drivers/edac/Kconfig       |   6 ++
>  drivers/edac/Makefile      |   1 +
>  drivers/edac/al_pos_edac.c | 173 +++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 187 insertions(+)
>  create mode 100644 drivers/edac/al_pos_edac.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e7a47b5..f5ce446 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -751,6 +751,13 @@ F:	drivers/tty/serial/altera_jtaguart.c
>  F:	include/linux/altera_uart.h
>  F:	include/linux/altera_jtaguart.h
>  
> +AMAZON ANNAPURNA LABS POS EDAC DRIVER
> +M:	Talel Shenhar <talel@amazon.com>
> +M:	Talel Shenhar <talelshenhar@gmail.com>
> +S:	Maintained
> +F:	Documentation/devicetree/bindings/edac/amazon,al-pos-edac.yaml
> +F:	drivers/edac/al-pos-edac.c
> +
>  AMAZON ANNAPURNA LABS THERMAL MMIO DRIVER
>  M:	Talel Shenhar <talel@amazon.com>
>  S:	Maintained
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
> index 200c04c..bb5805f 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -100,6 +100,12 @@ config EDAC_AMD64_ERROR_INJECTION
>  	  In addition, there are two control files, inject_read and inject_write,
>  	  which trigger the DRAM ECC Read and Write respectively.
>  
> +config EDAC_AL_POS
> +	tristate "Amazon's Annapurna Labs POS EDAC driver"
> +	depends on (ARCH_ALPINE || COMPILE_TEST)
> +	help
> +	  Include support for the SoC POS EDAC error capability.
> +
>  config EDAC_AMD76X
>  	tristate "AMD 76x (760, 762, 768)"
>  	depends on PCI && X86_32
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index 165ca65e..3571936 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -22,6 +22,7 @@ obj-$(CONFIG_EDAC_GHES)			+= ghes_edac.o
>  edac_mce_amd-y				:= mce_amd.o
>  obj-$(CONFIG_EDAC_DECODE_MCE)		+= edac_mce_amd.o
>  
> +obj-$(CONFIG_EDAC_AL_POS)		+= al_pos_edac.o
>  obj-$(CONFIG_EDAC_AMD76X)		+= amd76x_edac.o
>  obj-$(CONFIG_EDAC_CPC925)		+= cpc925_edac.o
>  obj-$(CONFIG_EDAC_I5000)		+= i5000_edac.o
> diff --git a/drivers/edac/al_pos_edac.c b/drivers/edac/al_pos_edac.c
> new file mode 100644
> index 00000000..bd6cd87
> --- /dev/null
> +++ b/drivers/edac/al_pos_edac.c
> @@ -0,0 +1,173 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
> + */
> +#include <linux/bitfield.h>
> +#include <linux/edac.h>
> +#include <linux/of_irq.h>
> +#include "edac_module.h"
> +
> +#define DRV_NAME "al_pos_edac"
> +#define AL_POS_EDAC_MSG_MAX 256
> +
> +/* Registers Offset */
> +#define AL_POS_ERROR_LOG_1	0x0
> +#define AL_POS_ERROR_LOG_0	0x4
> +
> +/* Registers Fields */
> +#define AL_POS_ERROR_LOG_1_VALID	BIT(31)
> +#define AL_POS_ERROR_LOG_1_BRESP	GENMASK(18, 17)
> +#define AL_POS_ERROR_LOG_1_REQUEST_ID	GENMASK(16, 8)
> +#define AL_POS_ERROR_LOG_1_ADDR_HIGH	GENMASK(7, 0)
> +
> +#define AL_POS_ERROR_LOG_0_ADDR_LOW	GENMASK(31, 0)
> +
> +struct al_pos_edac {
> +	struct edac_device_ctl_info *edac_dev;
> +	void __iomem *mmio_base;
> +	int irq;
> +};
> +
> +static int al_pos_handle(struct al_pos_edac *al_pos)
> +{
> +	u32 log0, log1;
> +	u64 addr;
> +	u16 request_id;
> +	u8 bresp;
> +	char msg[AL_POS_EDAC_MSG_MAX];
> +
> +	log1 = readl(al_pos->mmio_base + AL_POS_ERROR_LOG_1);

I already commented on the misuse of strict accesses. Unless you can
explain and document *why* you need the extra ordering, please use
relaxed accesses.

> +	if (!FIELD_GET(AL_POS_ERROR_LOG_1_VALID, log1))
> +		return 0;
> +
> +	log0 = readl(al_pos->mmio_base + AL_POS_ERROR_LOG_0);
> +	writel(0, al_pos->mmio_base + AL_POS_ERROR_LOG_1);
> +
> +	addr = FIELD_GET(AL_POS_ERROR_LOG_0_ADDR_LOW, log0);
> +	addr |= (((u64)FIELD_GET(AL_POS_ERROR_LOG_1_ADDR_HIGH, log1)) << 32);
> +	request_id = FIELD_GET(AL_POS_ERROR_LOG_1_REQUEST_ID, log1);
> +	bresp = FIELD_GET(AL_POS_ERROR_LOG_1_BRESP, log1);
> +
> +	snprintf(msg, sizeof(msg),
> +		 "addr=0x%llx request_id=0x%x bresp=0x%x\n",
> +		 addr, request_id, bresp);
> +
> +	edac_device_handle_ue(al_pos->edac_dev, 0, 0, msg);
> +
> +	return 1;
> +}
> +
> +static void al_pos_edac_check(struct edac_device_ctl_info *edac_dev)
> +{
> +	struct al_pos_edac *al_pos = edac_dev->pvt_info;
> +
> +	al_pos_handle(al_pos);
> +}
> +
> +static irqreturn_t al_pos_irq_handler(int irq, void *info)
> +{
> +	struct platform_device *pdev = info;
> +	struct al_pos_edac *al_pos = platform_get_drvdata(pdev);
> +
> +	if (al_pos_handle(al_pos))
> +		return IRQ_HANDLED;
> +	return IRQ_NONE;
> +}
> +
> +static int al_pos_probe(struct platform_device *pdev)
> +{
> +	struct edac_device_ctl_info *edac_dev;
> +	struct al_pos_edac *al_pos;
> +	int ret;
> +
> +	edac_dev = edac_device_alloc_ctl_info(sizeof(*al_pos), DRV_NAME, 1,
> +					      DRV_NAME, 1, 0, NULL, 0,
> +					      edac_device_alloc_index());
> +	if (!edac_dev)
> +		return -ENOMEM;
> +
> +	al_pos = edac_dev->pvt_info;
> +	al_pos->edac_dev = edac_dev;
> +	platform_set_drvdata(pdev, al_pos);
> +
> +	al_pos->mmio_base = devm_platform_ioremap_resource(pdev, 0);
> +	if (IS_ERR(al_pos->mmio_base)) {
> +		dev_err(&pdev->dev, "failed to ioremap memory (%ld)\n",
> +			PTR_ERR(al_pos->mmio_base));
> +		return PTR_ERR(al_pos->mmio_base);
> +	}
> +
> +	al_pos->irq = platform_get_irq(pdev, 0);
> +	if (al_pos->irq <= 0)
> +		edac_dev->edac_check = al_pos_edac_check;
> +
> +	edac_dev->dev = &pdev->dev;
> +	edac_dev->mod_name = DRV_NAME;
> +	edac_dev->dev_name = dev_name(&pdev->dev);
> +	edac_dev->ctl_name = "POS";
> +
> +	ret = edac_device_add_device(edac_dev);
> +	if (ret) {
> +		dev_err(&pdev->dev, "Failed to add edac device\n");
> +		goto err_free_edac;
> +	}
> +
> +	if (al_pos->irq > 0) {
> +		ret = devm_request_irq(&pdev->dev,
> +				       al_pos->irq,
> +				       al_pos_irq_handler,
> +				       0,
> +				       pdev->name,
> +				       pdev);
> +		if (ret != 0) {
> +			dev_err(&pdev->dev,
> +				"failed to register to irq %d (%d)\n",
> +				al_pos->irq, ret);
> +			goto err_remove_edac;

Would it be worth continuing without interrupts? After all, the
interrupt seems to be an optional part of the device...

Thanks,

	M.
Shenhar, Talel Oct. 7, 2019, 11:34 a.m. UTC | #2
thanks for the review

On 10/7/2019 2:26 PM, Marc Zyngier wrote:
> On Thu, 03 Oct 2019 12:32:41 +0100,
> Talel Shenhar <talel@amazon.com> wrote:
>> +	log1 = readl(al_pos->mmio_base + AL_POS_ERROR_LOG_1);
> I already commented on the misuse of strict accesses. Unless you can
> explain and document *why* you need the extra ordering, please use
> relaxed accesses.
agreeing on relaxed, shall be part of v5
>
>> +
>> +	if (al_pos->irq > 0) {
>> +		ret = devm_request_irq(&pdev->dev,
>> +				       al_pos->irq,
>> +				       al_pos_irq_handler,
>> +				       0,
>> +				       pdev->name,
>> +				       pdev);
>> +		if (ret != 0) {
>> +			dev_err(&pdev->dev,
>> +				"failed to register to irq %d (%d)\n",
>> +				al_pos->irq, ret);
>> +			goto err_remove_edac;
> Would it be worth continuing without interrupts? After all, the
> interrupt seems to be an optional part of the device...

Indeed, interrupts are optional, but only on some of the systems.

In some cases (and on some systems), this error event is critical and
must be handled quickly. For those, we define the interrupts.

So, bottom line, I would like to keep failing the probe when requesting
the interrupt fails.

>
> Thanks,
>
> 	M.
>
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index e7a47b5..f5ce446 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -751,6 +751,13 @@  F:	drivers/tty/serial/altera_jtaguart.c
 F:	include/linux/altera_uart.h
 F:	include/linux/altera_jtaguart.h
 
+AMAZON ANNAPURNA LABS POS EDAC DRIVER
+M:	Talel Shenhar <talel@amazon.com>
+M:	Talel Shenhar <talelshenhar@gmail.com>
+S:	Maintained
+F:	Documentation/devicetree/bindings/edac/amazon,al-pos-edac.yaml
+F:	drivers/edac/al_pos_edac.c
+
 AMAZON ANNAPURNA LABS THERMAL MMIO DRIVER
 M:	Talel Shenhar <talel@amazon.com>
 S:	Maintained
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 200c04c..bb5805f 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -100,6 +100,12 @@  config EDAC_AMD64_ERROR_INJECTION
 	  In addition, there are two control files, inject_read and inject_write,
 	  which trigger the DRAM ECC Read and Write respectively.
 
+config EDAC_AL_POS
+	tristate "Amazon's Annapurna Labs POS EDAC driver"
+	depends on (ARCH_ALPINE || COMPILE_TEST)
+	help
+	  Include support for the SoC POS EDAC error capability.
+
 config EDAC_AMD76X
 	tristate "AMD 76x (760, 762, 768)"
 	depends on PCI && X86_32
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 165ca65e..3571936 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -22,6 +22,7 @@  obj-$(CONFIG_EDAC_GHES)			+= ghes_edac.o
 edac_mce_amd-y				:= mce_amd.o
 obj-$(CONFIG_EDAC_DECODE_MCE)		+= edac_mce_amd.o
 
+obj-$(CONFIG_EDAC_AL_POS)		+= al_pos_edac.o
 obj-$(CONFIG_EDAC_AMD76X)		+= amd76x_edac.o
 obj-$(CONFIG_EDAC_CPC925)		+= cpc925_edac.o
 obj-$(CONFIG_EDAC_I5000)		+= i5000_edac.o
diff --git a/drivers/edac/al_pos_edac.c b/drivers/edac/al_pos_edac.c
new file mode 100644
index 00000000..bd6cd87
--- /dev/null
+++ b/drivers/edac/al_pos_edac.c
@@ -0,0 +1,173 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ */
+#include <linux/bitfield.h>
+#include <linux/edac.h>
+#include <linux/of_irq.h>
+#include "edac_module.h"
+
+#define DRV_NAME "al_pos_edac"	/* platform driver, EDAC module and control name */
+#define AL_POS_EDAC_MSG_MAX 256	/* size of the buffer the error text is formatted into */
+
+/* Registers Offset (added to the mapped MMIO base) */
+#define AL_POS_ERROR_LOG_1	0x0
+#define AL_POS_ERROR_LOG_0	0x4
+
+/* Registers Fields (extracted with FIELD_GET() in al_pos_handle()) */
+#define AL_POS_ERROR_LOG_1_VALID	BIT(31)	/* checked before the log is reported */
+#define AL_POS_ERROR_LOG_1_BRESP	GENMASK(18, 17)	/* printed as "bresp" */
+#define AL_POS_ERROR_LOG_1_REQUEST_ID	GENMASK(16, 8)	/* printed as "request_id" */
+#define AL_POS_ERROR_LOG_1_ADDR_HIGH	GENMASK(7, 0)	/* address bits 39:32 */
+
+#define AL_POS_ERROR_LOG_0_ADDR_LOW	GENMASK(31, 0)	/* address bits 31:0 */
+
+struct al_pos_edac {	/* per-device state, placed in the EDAC control struct's pvt_info */
+	struct edac_device_ctl_info *edac_dev;	/* EDAC device that errors are reported to */
+	void __iomem *mmio_base;	/* mapping of platform memory resource 0 */
+	int irq;	/* from platform_get_irq(); <= 0 means no interrupt is used */
+};
+
+/**
+ * al_pos_handle() - report a logged POS error to EDAC, if there is one
+ * @al_pos: driver state holding the register mapping and the EDAC device
+ *
+ * Reads ERROR_LOG_1 and returns early unless its valid bit is set.
+ * Otherwise reads ERROR_LOG_0, writes 0 to ERROR_LOG_1, and reports the
+ * captured address, request id and bresp as an uncorrectable error.
+ *
+ * Called from both the interrupt handler and the EDAC check callback.
+ *
+ * Return: 1 if an error was reported, 0 if the log was not valid.
+ */
+static int al_pos_handle(struct al_pos_edac *al_pos)
+{
+	u32 log0, log1;
+	u64 addr;
+	u16 request_id;
+	u8 bresp;
+	char msg[AL_POS_EDAC_MSG_MAX];
+
+	/*
+	 * Relaxed accessors are sufficient: this function only touches the
+	 * device's own registers and local variables, so no ordering against
+	 * normal memory accesses is required.
+	 */
+	log1 = readl_relaxed(al_pos->mmio_base + AL_POS_ERROR_LOG_1);
+	if (!FIELD_GET(AL_POS_ERROR_LOG_1_VALID, log1))
+		return 0;
+
+	/* Capture the second log word before the log is written back as 0 */
+	log0 = readl_relaxed(al_pos->mmio_base + AL_POS_ERROR_LOG_0);
+	writel_relaxed(0, al_pos->mmio_base + AL_POS_ERROR_LOG_1);
+
+	addr = FIELD_GET(AL_POS_ERROR_LOG_0_ADDR_LOW, log0);
+	addr |= (((u64)FIELD_GET(AL_POS_ERROR_LOG_1_ADDR_HIGH, log1)) << 32);
+	request_id = FIELD_GET(AL_POS_ERROR_LOG_1_REQUEST_ID, log1);
+	bresp = FIELD_GET(AL_POS_ERROR_LOG_1_BRESP, log1);
+
+	/*
+	 * No trailing newline here: the message is embedded by the EDAC core
+	 * in its own log line, which supplies the line terminator.
+	 */
+	snprintf(msg, sizeof(msg),
+		 "addr=0x%llx request_id=0x%x bresp=0x%x",
+		 addr, request_id, bresp);
+
+	edac_device_handle_ue(al_pos->edac_dev, 0, 0, msg);
+
+	return 1;
+}
+
+/*
+ * EDAC check callback, installed by probe when no interrupt is available.
+ * The driver state is the private area of @edac_dev; whether an error was
+ * found is of no interest to the caller, so the result is dropped.
+ */
+static void al_pos_edac_check(struct edac_device_ctl_info *edac_dev)
+{
+	al_pos_handle(edac_dev->pvt_info);
+}
+
+/*
+ * Interrupt handler. @info is the platform device passed as the cookie when
+ * the interrupt was requested. Reports IRQ_NONE when no valid error was
+ * logged, IRQ_HANDLED otherwise.
+ */
+static irqreturn_t al_pos_irq_handler(int irq, void *info)
+{
+	struct platform_device *pdev = info;
+	int reported = al_pos_handle(platform_get_drvdata(pdev));
+
+	return reported ? IRQ_HANDLED : IRQ_NONE;
+}
+
+/**
+ * al_pos_probe() - bind the POS EDAC driver to a platform device
+ * @pdev: platform device being probed
+ *
+ * Allocates the EDAC control structure (the driver state lives in its
+ * private area), maps memory resource 0 and adds the EDAC device. If an
+ * interrupt is available it is requested; otherwise al_pos_edac_check() is
+ * installed as the EDAC check callback.
+ *
+ * Every failure after the allocation goes through the error labels so the
+ * EDAC control structure is always released.
+ *
+ * Return: 0 on success, a negative errno on failure.
+ */
+static int al_pos_probe(struct platform_device *pdev)
+{
+	struct edac_device_ctl_info *edac_dev;
+	struct al_pos_edac *al_pos;
+	int ret;
+
+	edac_dev = edac_device_alloc_ctl_info(sizeof(*al_pos), DRV_NAME, 1,
+					      DRV_NAME, 1, 0, NULL, 0,
+					      edac_device_alloc_index());
+	if (!edac_dev)
+		return -ENOMEM;
+
+	al_pos = edac_dev->pvt_info;
+	al_pos->edac_dev = edac_dev;
+	platform_set_drvdata(pdev, al_pos);
+
+	al_pos->mmio_base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(al_pos->mmio_base)) {
+		ret = PTR_ERR(al_pos->mmio_base);
+		dev_err(&pdev->dev, "failed to ioremap memory (%d)\n", ret);
+		/* edac_dev is not device-managed; free it explicitly */
+		goto err_free_edac;
+	}
+
+	al_pos->irq = platform_get_irq(pdev, 0);
+	if (al_pos->irq == -EPROBE_DEFER) {
+		/*
+		 * The interrupt exists but its controller is not ready yet:
+		 * retry later instead of silently degrading to polling.
+		 */
+		ret = -EPROBE_DEFER;
+		goto err_free_edac;
+	}
+	if (al_pos->irq <= 0)
+		edac_dev->edac_check = al_pos_edac_check;
+
+	edac_dev->dev = &pdev->dev;
+	edac_dev->mod_name = DRV_NAME;
+	edac_dev->dev_name = dev_name(&pdev->dev);
+	edac_dev->ctl_name = "POS";
+
+	ret = edac_device_add_device(edac_dev);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to add edac device\n");
+		goto err_free_edac;
+	}
+
+	if (al_pos->irq > 0) {
+		ret = devm_request_irq(&pdev->dev,
+				       al_pos->irq,
+				       al_pos_irq_handler,
+				       0,
+				       pdev->name,
+				       pdev);
+		if (ret != 0) {
+			dev_err(&pdev->dev,
+				"failed to register to irq %d (%d)\n",
+				al_pos->irq, ret);
+			goto err_remove_edac;
+		}
+	}
+
+	return 0;
+
+err_remove_edac:
+	edac_device_del_device(edac_dev->dev);
+err_free_edac:
+	edac_device_free_ctl_info(edac_dev);
+
+	return ret;
+}
+
+/*
+ * Unbind: undo al_pos_probe(). Always returns 0.
+ */
+static int al_pos_remove(struct platform_device *pdev)
+{
+	struct al_pos_edac *priv = platform_get_drvdata(pdev);
+	struct edac_device_ctl_info *edac_dev = priv->edac_dev;
+
+	/*
+	 * The handler dereferences the driver state, which lives inside
+	 * edac_dev, so the interrupt is released before edac_dev is freed.
+	 */
+	if (priv->irq > 0)
+		devm_free_irq(&pdev->dev, priv->irq, pdev);
+
+	edac_device_del_device(edac_dev->dev);
+	edac_device_free_ctl_info(edac_dev);
+
+	return 0;
+}
+
+static const struct of_device_id al_pos_of_match[] = {	/* devicetree compatibles this driver binds to */
+	{ .compatible = "amazon,al-pos-edac", },
+	{},	/* sentinel: empty entry terminates the table */
+};
+
+MODULE_DEVICE_TABLE(of, al_pos_of_match);
+
+static struct platform_driver al_pos_driver = {	/* platform driver glue for the POS EDAC device */
+	.probe = al_pos_probe,
+	.remove = al_pos_remove,
+	.driver = {
+		.name = DRV_NAME,
+		.of_match_table = al_pos_of_match,	/* matched via the "amazon,al-pos-edac" compatible */
+	},
+};
+
+module_platform_driver(al_pos_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Talel Shenhar");
+MODULE_DESCRIPTION("Amazon's Annapurna Labs POS driver");