diff mbox series

[2/2] EDAC/igen6: Add polling support

Message ID 20241101204211.414664-3-orange@aiven.io (mailing list archive)
State New
Headers show
Series EDAC/igen6: Avoid segmentation fault and add polling support | expand

Commit Message

Orange Kao Nov. 1, 2024, 8:41 p.m. UTC
I have a PC with Intel N100 (with PCI device 8086:461c, DID_ADL_N_SKU4)
but interrupt does not seems to work, even with the following
configuration in BIOS. I am not sure if this is caused by a BIOS bug or
not.

    In-Band ECC Support: Enabled
    In-Band ECC Operation Mode: 2 (make all requests protected and
                                  ignore range checks)
    IBECC Error Injection Control: Inject Correctable Error on insertion
                                   counter
    Error Injaction Insertion Count: 251658240 (0xf000000)

This commit tried to add polling support.
---
 drivers/edac/igen6_edac.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

Comments

Zhuo, Qiuxu Nov. 2, 2024, 10:24 a.m. UTC | #1
Hi Orange,

There are two things to do:

  1) Set ' edac_op_state' correctly (preparation work).
       This is provided as a separate patch in [1].

  2) Add polling mode support (your current patch).
       Please incorporate [2] into your current patch and update
       your current patch on top of [1] with the comments below.

Eventually, there will be 3 patches in your series. Could you please
re-send all these 3 patches after you resolve the comments below?

If any questions, feel free to let me know. Thanks!

TIP: the command below can quickly provide you with useful comments for your patch. 

    ${LINUX}/scripts/checkpatch.pl --strict   <your patch>

> From: Orange Kao <orange@aiven.io>
> [...]
> 
> I have a PC with Intel N100 (with PCI device 8086:461c, DID_ADL_N_SKU4) but
> interrupt does not seems to work, even with the following configuration in
> BIOS. I am not sure if this is caused by a BIOS bug or not.

Tweak this commit message a bit as below:

  Some PCs with Intel N100 (with PCI device 8086:461c, DID_ADL_N_SKU4)
  experienced issues with error interrupts not working, even with the following 
  configuration in the BIOS.

> 
>     In-Band ECC Support: Enabled
>     In-Band ECC Operation Mode: 2 (make all requests protected and
>                                   ignore range checks)
>     IBECC Error Injection Control: Inject Correctable Error on insertion
>                                    counter
>     Error Injaction Insertion Count: 251658240 (0xf000000)

s/Injaction/Injection

> 
> This commit tried to add polling support.

Tweak this sentence a bit as below:

  Add polling mode support for these machines to ensure that memory error
  events are handled.

Also append your "Signed-off-by:" tag here.

> ---
>  drivers/edac/igen6_edac.c | 23 ++++++++++++++++++++++-
>  1 file changed, 22 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index
> 07dacf8c10be..5027070410a5 100644
> --- a/drivers/edac/igen6_edac.c
> +++ b/drivers/edac/igen6_edac.c
> @@ -1170,6 +1170,19 @@ static int igen6_pci_setup(struct pci_dev *pdev,
> u64 *mchbar)
>  	return -ENODEV;
>  }
> 
> +static void igen6_check(struct mem_ctl_info *mci) {
> +	struct igen6_imc *imc = mci->pvt_info;
> +
> +	/* errsts_clear() isn't NMI-safe. Delay it in the IRQ context */
> +	u64 ecclog = ecclog_read_and_clear(imc);
> +	if (!ecclog)
> +		return;

Add a blank line here.

> +	if (!ecclog_gen_pool_add(imc->mc, ecclog))
> +		irq_work_queue(&ecclog_irq_work);
> +
> +}
> +
>  static int igen6_register_mci(int mc, u64 mchbar, struct pci_dev *pdev)  {
>  	struct edac_mc_layer layers[2];
> @@ -1211,6 +1224,9 @@ static int igen6_register_mci(int mc, u64 mchbar,
> struct pci_dev *pdev)
>  	mci->edac_cap = EDAC_FLAG_SECDED;
>  	mci->mod_name = EDAC_MOD_STR;
>  	mci->dev_name = pci_name(pdev);
> +	if (edac_op_state == EDAC_OPSTATE_POLL) {
> +		mci->edac_check = igen6_check;
> +	}
>  	mci->pvt_info = &igen6_pvt->imc[mc];
> 
>  	imc = mci->pvt_info;
> @@ -1450,7 +1466,9 @@ static int __init igen6_init(void)
>  	if (owner && strncmp(owner, EDAC_MOD_STR,
> sizeof(EDAC_MOD_STR)))
>  		return -EBUSY;
> 
> -	edac_op_state = EDAC_OPSTATE_NMI;
> +	if (edac_op_state == EDAC_OPSTATE_INVAL) {
> +		edac_op_state = EDAC_OPSTATE_NMI;
> +	}

These 4 diffs are not needed after the separate patch [1].

>  	rc = pci_register_driver(&igen6_driver);
>  	if (rc)
> @@ -1474,3 +1492,6 @@ module_exit(igen6_exit);  MODULE_LICENSE("GPL
> v2");  MODULE_AUTHOR("Qiuxu Zhuo");  MODULE_DESCRIPTION("MC Driver
> for Intel client SoC using In-Band ECC");
> +
> +module_param(edac_op_state, int, 0444);
> MODULE_PARM_DESC(edac_op_state,
> +"EDAC Error Reporting state: 0=Poll,1=NMI. Default=1");

The driver now supports polling mode or detects it based on the configuration mode. 
Please replace this line with the one in [2] accordingly. 

> --
> 2.47.0

=======

[1]
From 182c717f9f46a9dc0711a8d9a1c42be6deb2cdb3 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Sat, 2 Nov 2024 16:15:09 +0800
Subject: [PATCH 1/1] EDAC/igen6: Initialize edac_op_state according to the
 configuration data

Currently, igen6_edac sets edac_op_state to EDAC_OPSTATE_NMI, while the
driver also supports memory errors reported from Machine Check. Initialize
edac_op_state to the correct value according to the configuration data
that the driver probed.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
---
 drivers/edac/igen6_edac.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c
index 189a2fc29e74..b6ac14b69358 100644
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@ -1348,6 +1348,15 @@ static void unregister_err_handler(void)
        unregister_nmi_handler(NMI_SERR, IGEN6_NMI_NAME);
 }

+static void opstate_set(struct res_config *cfg)
+{
+       /* Set the mode according to the configuration data. */
+       if (cfg->machine_check)
+               edac_op_state = EDAC_OPSTATE_INT;
+       else
+               edac_op_state = EDAC_OPSTATE_NMI;
+}
+
 static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
        u64 mchbar;
@@ -1365,6 +1374,8 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (rc)
                goto fail;

+       opstate_set(res_cfg);
+
        for (i = 0; i < res_cfg->num_imc; i++) {
                rc = igen6_register_mci(i, mchbar, pdev);
                if (rc)
@@ -1448,8 +1459,6 @@ static int __init igen6_init(void)
        if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR)))
                return -EBUSY;

-       edac_op_state = EDAC_OPSTATE_NMI;
-
        rc = pci_register_driver(&igen6_driver);
        if (rc)
                return rc;
--
2.17.1


[2]
diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c
index 4f38bb1c1fc3..c8b6e41a12b5 100644
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@ -1368,6 +1368,10 @@ static void unregister_err_handler(void)

 static void opstate_set(struct res_config *cfg)
 {
+       /* Only the polling mode can be set via the module parameter. */
+       if (edac_op_state == EDAC_OPSTATE_POLL)
+               return;
+
        /* Set the mode according to the configuration data. */
        if (cfg->machine_check)
                edac_op_state = EDAC_OPSTATE_INT;
@@ -1505,4 +1509,4 @@ MODULE_AUTHOR("Qiuxu Zhuo");
 MODULE_DESCRIPTION("MC Driver for Intel client SoC using In-Band ECC");

 module_param(edac_op_state, int, 0444);
-MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI. Default=1");
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll, Others or default=Auto detect");
diff mbox series

Patch

diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c
index 07dacf8c10be..5027070410a5 100644
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@ -1170,6 +1170,19 @@  static int igen6_pci_setup(struct pci_dev *pdev, u64 *mchbar)
 	return -ENODEV;
 }
 
+static void igen6_check(struct mem_ctl_info *mci)
+{
+	struct igen6_imc *imc = mci->pvt_info;
+
+	/* errsts_clear() isn't NMI-safe. Delay it in the IRQ context */
+	u64 ecclog = ecclog_read_and_clear(imc);
+	if (!ecclog)
+		return;
+	if (!ecclog_gen_pool_add(imc->mc, ecclog))
+		irq_work_queue(&ecclog_irq_work);
+
+}
+
 static int igen6_register_mci(int mc, u64 mchbar, struct pci_dev *pdev)
 {
 	struct edac_mc_layer layers[2];
@@ -1211,6 +1224,9 @@  static int igen6_register_mci(int mc, u64 mchbar, struct pci_dev *pdev)
 	mci->edac_cap = EDAC_FLAG_SECDED;
 	mci->mod_name = EDAC_MOD_STR;
 	mci->dev_name = pci_name(pdev);
+	if (edac_op_state == EDAC_OPSTATE_POLL) {
+		mci->edac_check = igen6_check;
+	}
 	mci->pvt_info = &igen6_pvt->imc[mc];
 
 	imc = mci->pvt_info;
@@ -1450,7 +1466,9 @@  static int __init igen6_init(void)
 	if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR)))
 		return -EBUSY;
 
-	edac_op_state = EDAC_OPSTATE_NMI;
+	if (edac_op_state == EDAC_OPSTATE_INVAL) {
+		edac_op_state = EDAC_OPSTATE_NMI;
+	}
 
 	rc = pci_register_driver(&igen6_driver);
 	if (rc)
@@ -1474,3 +1492,6 @@  module_exit(igen6_exit);
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Qiuxu Zhuo");
 MODULE_DESCRIPTION("MC Driver for Intel client SoC using In-Band ECC");
+
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI. Default=1");