diff mbox series

[v5,2/3] ACPI, APEI, EINJ: Add CXL 1.1 EINJ error type support

Message ID 20230925200127.504256-3-Benjamin.Cheatham@amd.com (mailing list archive)
State Superseded, archived
Headers show
Series CXL, ACPI, APEI, EINJ: Update EINJ for CXL 1.1 error types | expand

Commit Message

Ben Cheatham Sept. 25, 2023, 8:01 p.m. UTC
Add support for CXL EINJ error types for CXL 1.1 hosts added in ACPI
v6.5. Because these error types target memory-mapped CXL 1.1 compliant
downstream ports and not physical (normal/persistent) memory, these
error types are not currently  allowed through the memory range
validation done by the EINJ driver.

The MMIO address of a CXL 1.1 downstream port can be found in the
cxl_rcrb_addr file in the corresponding dport directory under
/sys/bus/cxl/devices/portX. CXL 1.1 error types follow the same
procedure as a memory error type, but with param1 set to the
downstream port MMIO address.

Example usage:
$ cd /sys/kernel/debug/apei/einj
$ cat available_error_type
    0x00000008      Memory Correctable
    0x00000010      Memory Uncorrectable non-fatal
    0x00000020      Memory Uncorrectable fatal
    0x00000040      PCI Express Correctable
    0x00000080      PCI Express Uncorrectable non-fatal
    0x00000100      PCI Express Uncorrectable fatal
    0x00008000      CXL.mem Protocol Correctable
    0x00020000      CXL.mem Protocol Uncorrectable fatal
$ echo 0x8000 > error_type
$ echo 0xfffffffffffff000 > param2
$ echo 0x2 > flags
$ cat /sys/bus/cxl/devices/portX/dportY/cxl_rcrb_addr
0xb2f00000
$ echo 0xb2f00000 > param1
$ echo 1 > error_inject

Signed-off-by: Ben Cheatham <Benjamin.Cheatham@amd.com>
---
 drivers/acpi/apei/Kconfig |  2 ++
 drivers/acpi/apei/einj.c  | 24 +++++++++++++++++++++++-
 drivers/cxl/acpi.c        |  1 +
 drivers/cxl/core/port.c   | 17 +++++++++++++++++
 drivers/cxl/cxl.h         |  1 +
 include/linux/cxl.h       | 11 +++++++++++
 6 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/cxl.h

Comments

Jonathan Cameron Sept. 26, 2023, 11:04 a.m. UTC | #1
On Mon, 25 Sep 2023 15:01:26 -0500
Ben Cheatham <Benjamin.Cheatham@amd.com> wrote:

> Add support for CXL EINJ error types for CXL 1.1 hosts added in ACPI
> v6.5. Because these error types target memory-mapped CXL 1.1 compliant
> downstream ports and not physical (normal/persistent) memory, these
> error types are not currently  allowed through the memory range
> validation done by the EINJ driver.
> 
> The MMIO address of a CXL 1.1 downstream port can be found in the
> cxl_rcrb_addr file in the corresponding dport directory under
> /sys/bus/cxl/devices/portX. CXL 1.1 error types follow the same
> procedure as a memory error type, but with param1 set to the
> downstream port MMIO address.
> 
> Example usage:
> $ cd /sys/kernel/debug/apei/einj
> $ cat available_error_type
>     0x00000008      Memory Correctable
>     0x00000010      Memory Uncorrectable non-fatal
>     0x00000020      Memory Uncorrectable fatal
>     0x00000040      PCI Express Correctable
>     0x00000080      PCI Express Uncorrectable non-fatal
>     0x00000100      PCI Express Uncorrectable fatal
>     0x00008000      CXL.mem Protocol Correctable
>     0x00020000      CXL.mem Protocol Uncorrectable fatal
> $ echo 0x8000 > error_type
> $ echo 0xfffffffffffff000 > param2
> $ echo 0x2 > flags
> $ cat /sys/bus/cxl/devices/portX/dportY/cxl_rcrb_addr
> 0xb2f00000
> $ echo 0xb2f00000 > param1
> $ echo 1 > error_inject
> 
> Signed-off-by: Ben Cheatham <Benjamin.Cheatham@amd.com>
Hi Ben

A few trivial things inline.

Jonathan

> ---
>  drivers/acpi/apei/Kconfig |  2 ++
>  drivers/acpi/apei/einj.c  | 24 +++++++++++++++++++++++-
>  drivers/cxl/acpi.c        |  1 +
>  drivers/cxl/core/port.c   | 17 +++++++++++++++++
>  drivers/cxl/cxl.h         |  1 +
>  include/linux/cxl.h       | 11 +++++++++++
>  6 files changed, 55 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/cxl.h
> 
> diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
> index 6b18f8bc7be3..eb9cc7157342 100644
> --- a/drivers/acpi/apei/Kconfig
> +++ b/drivers/acpi/apei/Kconfig
> @@ -55,6 +55,8 @@ config ACPI_APEI_MEMORY_FAILURE
>  config ACPI_APEI_EINJ
>  	tristate "APEI Error INJection (EINJ)"
>  	depends on ACPI_APEI && DEBUG_FS
> +	imply CXL_BUS
> +	imply CXL_ACPI
>  	help
>  	  EINJ provides a hardware error injection mechanism, it is
>  	  mainly used for debugging and testing the other parts of
> diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
> index 013eb621dc92..8000417a5f26 100644
> --- a/drivers/acpi/apei/einj.c
> +++ b/drivers/acpi/apei/einj.c
> @@ -21,6 +21,7 @@
>  #include <linux/nmi.h>
>  #include <linux/delay.h>
>  #include <linux/mm.h>
> +#include <linux/cxl.h>
>  #include <asm/unaligned.h>
>  
>  #include "apei-internal.h"
> @@ -36,6 +37,7 @@
>  #define MEM_ERROR_MASK		(ACPI_EINJ_MEMORY_CORRECTABLE | \
>  				ACPI_EINJ_MEMORY_UNCORRECTABLE | \
>  				ACPI_EINJ_MEMORY_FATAL)
> +#define CXL_ERROR_MASK		GENMASK(17, 12)
>  
>  /*
>   * ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action.
> @@ -512,6 +514,22 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>  	return rc;
>  }
>  
> +static int is_valid_cxl_addr(u64 addr)
> +{
> +	struct cxl_dport *dport;
> +
> +	if (IS_REACHABLE(CONFIG_CXL_ACPI))
> +		dport = cxl_find_rch_dport_by_rcrb((resource_size_t) addr);
> +	else
> +		return 0;
Flip this logic probably so the quick exit is the indented path.

	if (!IS_REACHABLE(CONFIG_CXL_ACPI))
		return 0

	dport = ...

> +
> +	if (!IS_ERR_OR_NULL(dport))
> +		return 1;
> +
> +	pr_info("Could not find dport with rcrb 0x%llx\n", addr);
> +	return 0;
> +}
> +
>  /* Inject the specified hardware error */
>  static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>  			     u64 param3, u64 param4)
> @@ -537,8 +555,11 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>  	if (type & ACPI5_VENDOR_BIT) {
>  		if (vendor_flags != SETWA_FLAGS_MEM)
>  			goto inject;
> -	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM))
> +	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM)) {
>  		goto inject;
> +	} else if (type & CXL_ERROR_MASK && is_valid_cxl_addr(param1)) {
> +		goto inject;
> +	}
>  
>  	/*
>  	 * Disallow crazy address masks that give BIOS leeway to pick
> @@ -807,3 +828,4 @@ module_exit(einj_exit);
>  MODULE_AUTHOR("Huang Ying");
>  MODULE_DESCRIPTION("APEI Error INJection support");
>  MODULE_LICENSE("GPL");
> +MODULE_IMPORT_NS(CXL);
> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
> index 3e2ca946bf47..a7adf3d9814e 100644
> --- a/drivers/cxl/acpi.c
> +++ b/drivers/cxl/acpi.c
> @@ -756,6 +756,7 @@ static void __exit cxl_acpi_exit(void)
>  {
>  	platform_driver_unregister(&cxl_acpi_driver);
>  	cxl_bus_drain();
> +	set_cxl_root(NULL);

Why this patch rather than previous?

>  }
>  
>  /* load before dax_hmem sees 'Soft Reserved' CXL ranges */
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index c3914e73f67e..cb653cb1a1ea 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -1147,6 +1147,23 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
>  }
>  EXPORT_SYMBOL_NS_GPL(devm_cxl_add_rch_dport, CXL);
>  
> +struct cxl_dport *cxl_find_rch_dport_by_rcrb(resource_size_t rcrb_base)
> +{
> +	struct cxl_dport *dport;
> +	unsigned long index;
> +
> +	if (!cxl_root)
> +		return ERR_PTR(-ENODEV);
> +
> +	xa_for_each(&cxl_root->dports, index, dport)
> +		if ((dport->rch && dport->rcrb.base != CXL_RESOURCE_NONE)
> +		    && dport->rcrb.base == rcrb_base)

Local style in this file has && at end of line above.

> +			return dport;
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_find_rch_dport_by_rcrb, CXL);
> +
>  static int add_ep(struct cxl_ep *new)
>  {
>  	struct cxl_port *port = new->dport->port;
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 4d5bce4bae7e..3e6779dbcd23 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -8,6 +8,7 @@
>  #include <linux/bitfield.h>
>  #include <linux/bitops.h>
>  #include <linux/log2.h>
> +#include <linux/cxl.h>
>  #include <linux/io.h>
>  
>  /**
> diff --git a/include/linux/cxl.h b/include/linux/cxl.h
> new file mode 100644
> index 000000000000..6702caa78e7a
> --- /dev/null
> +++ b/include/linux/cxl.h

We have other code outside drivers/cxl that gets to the
CXL headers directly (to avoid this include being created before now).

https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/perf/cxl_pmu.c#L24

There wasn't much enthusiasm for creating a globally visible header hence
I did that ../cxl stuff for that.

Same probably applies here.

Jonathan

> @@ -0,0 +1,11 @@
> +#ifndef _LINUX_CXL_H
> +#define _LINUX_CXL_H
> +
> +#include <linux/xarray.h>
> +#include <linux/errno.h>
> +
> +struct cxl_dport;
> +
> +struct cxl_dport *cxl_find_rch_dport_by_rcrb(resource_size_t rcrb_base);
> +
> +#endif
Ben Cheatham Sept. 26, 2023, 4 p.m. UTC | #2
On 9/26/23 6:04 AM, Jonathan Cameron wrote:
> On Mon, 25 Sep 2023 15:01:26 -0500
> Ben Cheatham <Benjamin.Cheatham@amd.com> wrote:
> 
>> Add support for CXL EINJ error types for CXL 1.1 hosts added in ACPI
>> v6.5. Because these error types target memory-mapped CXL 1.1 compliant
>> downstream ports and not physical (normal/persistent) memory, these
>> error types are not currently  allowed through the memory range
>> validation done by the EINJ driver.
>>
>> The MMIO address of a CXL 1.1 downstream port can be found in the
>> cxl_rcrb_addr file in the corresponding dport directory under
>> /sys/bus/cxl/devices/portX. CXL 1.1 error types follow the same
>> procedure as a memory error type, but with param1 set to the
>> downstream port MMIO address.
>>
>> Example usage:
>> $ cd /sys/kernel/debug/apei/einj
>> $ cat available_error_type
>>     0x00000008      Memory Correctable
>>     0x00000010      Memory Uncorrectable non-fatal
>>     0x00000020      Memory Uncorrectable fatal
>>     0x00000040      PCI Express Correctable
>>     0x00000080      PCI Express Uncorrectable non-fatal
>>     0x00000100      PCI Express Uncorrectable fatal
>>     0x00008000      CXL.mem Protocol Correctable
>>     0x00020000      CXL.mem Protocol Uncorrectable fatal
>> $ echo 0x8000 > error_type
>> $ echo 0xfffffffffffff000 > param2
>> $ echo 0x2 > flags
>> $ cat /sys/bus/cxl/devices/portX/dportY/cxl_rcrb_addr
>> 0xb2f00000
>> $ echo 0xb2f00000 > param1
>> $ echo 1 > error_inject
>>
>> Signed-off-by: Ben Cheatham <Benjamin.Cheatham@amd.com>
> Hi Ben
> 
> A few trivial things inline.
> 
> Jonathan
> 
>> ---
>>  drivers/acpi/apei/Kconfig |  2 ++
>>  drivers/acpi/apei/einj.c  | 24 +++++++++++++++++++++++-
>>  drivers/cxl/acpi.c        |  1 +
>>  drivers/cxl/core/port.c   | 17 +++++++++++++++++
>>  drivers/cxl/cxl.h         |  1 +
>>  include/linux/cxl.h       | 11 +++++++++++
>>  6 files changed, 55 insertions(+), 1 deletion(-)
>>  create mode 100644 include/linux/cxl.h
>>
>> diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
>> index 6b18f8bc7be3..eb9cc7157342 100644
>> --- a/drivers/acpi/apei/Kconfig
>> +++ b/drivers/acpi/apei/Kconfig
>> @@ -55,6 +55,8 @@ config ACPI_APEI_MEMORY_FAILURE
>>  config ACPI_APEI_EINJ
>>  	tristate "APEI Error INJection (EINJ)"
>>  	depends on ACPI_APEI && DEBUG_FS
>> +	imply CXL_BUS
>> +	imply CXL_ACPI
>>  	help
>>  	  EINJ provides a hardware error injection mechanism, it is
>>  	  mainly used for debugging and testing the other parts of
>> diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
>> index 013eb621dc92..8000417a5f26 100644
>> --- a/drivers/acpi/apei/einj.c
>> +++ b/drivers/acpi/apei/einj.c
>> @@ -21,6 +21,7 @@
>>  #include <linux/nmi.h>
>>  #include <linux/delay.h>
>>  #include <linux/mm.h>
>> +#include <linux/cxl.h>
>>  #include <asm/unaligned.h>
>>  
>>  #include "apei-internal.h"
>> @@ -36,6 +37,7 @@
>>  #define MEM_ERROR_MASK		(ACPI_EINJ_MEMORY_CORRECTABLE | \
>>  				ACPI_EINJ_MEMORY_UNCORRECTABLE | \
>>  				ACPI_EINJ_MEMORY_FATAL)
>> +#define CXL_ERROR_MASK		GENMASK(17, 12)
>>  
>>  /*
>>   * ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action.
>> @@ -512,6 +514,22 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>  	return rc;
>>  }
>>  
>> +static int is_valid_cxl_addr(u64 addr)
>> +{
>> +	struct cxl_dport *dport;
>> +
>> +	if (IS_REACHABLE(CONFIG_CXL_ACPI))
>> +		dport = cxl_find_rch_dport_by_rcrb((resource_size_t) addr);
>> +	else
>> +		return 0;
> Flip this logic probably so the quick exit is the indented path.
> 
> 	if (!IS_REACHABLE(CONFIG_CXL_ACPI))
> 		return 0
> 
> 	dport = ...

Will do.

> 
>> +
>> +	if (!IS_ERR_OR_NULL(dport))
>> +		return 1;
>> +
>> +	pr_info("Could not find dport with rcrb 0x%llx\n", addr);
>> +	return 0;
>> +}
>> +
>>  /* Inject the specified hardware error */
>>  static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>  			     u64 param3, u64 param4)
>> @@ -537,8 +555,11 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
>>  	if (type & ACPI5_VENDOR_BIT) {
>>  		if (vendor_flags != SETWA_FLAGS_MEM)
>>  			goto inject;
>> -	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM))
>> +	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM)) {
>>  		goto inject;
>> +	} else if (type & CXL_ERROR_MASK && is_valid_cxl_addr(param1)) {
>> +		goto inject;
>> +	}
>>  
>>  	/*
>>  	 * Disallow crazy address masks that give BIOS leeway to pick
>> @@ -807,3 +828,4 @@ module_exit(einj_exit);
>>  MODULE_AUTHOR("Huang Ying");
>>  MODULE_DESCRIPTION("APEI Error INJection support");
>>  MODULE_LICENSE("GPL");
>> +MODULE_IMPORT_NS(CXL);
>> diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
>> index 3e2ca946bf47..a7adf3d9814e 100644
>> --- a/drivers/cxl/acpi.c
>> +++ b/drivers/cxl/acpi.c
>> @@ -756,6 +756,7 @@ static void __exit cxl_acpi_exit(void)
>>  {
>>  	platform_driver_unregister(&cxl_acpi_driver);
>>  	cxl_bus_drain();
>> +	set_cxl_root(NULL);
> 
> Why this patch rather than previous?
> 

Good point. I added this a bit late and seem to have accidentally
put it in the wrong patch. I'll move it into the previous.

>>  }
>>  
>>  /* load before dax_hmem sees 'Soft Reserved' CXL ranges */
>> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
>> index c3914e73f67e..cb653cb1a1ea 100644
>> --- a/drivers/cxl/core/port.c
>> +++ b/drivers/cxl/core/port.c
>> @@ -1147,6 +1147,23 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
>>  }
>>  EXPORT_SYMBOL_NS_GPL(devm_cxl_add_rch_dport, CXL);
>>  
>> +struct cxl_dport *cxl_find_rch_dport_by_rcrb(resource_size_t rcrb_base)
>> +{
>> +	struct cxl_dport *dport;
>> +	unsigned long index;
>> +
>> +	if (!cxl_root)
>> +		return ERR_PTR(-ENODEV);
>> +
>> +	xa_for_each(&cxl_root->dports, index, dport)
>> +		if ((dport->rch && dport->rcrb.base != CXL_RESOURCE_NONE)
>> +		    && dport->rcrb.base == rcrb_base)
> 
> Local style in this file has && at end of line above.
> 

I'll fix that, thanks for pointing it out.

>> +			return dport;
>> +
>> +	return NULL;
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_find_rch_dport_by_rcrb, CXL);
>> +
>>  static int add_ep(struct cxl_ep *new)
>>  {
>>  	struct cxl_port *port = new->dport->port;
>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>> index 4d5bce4bae7e..3e6779dbcd23 100644
>> --- a/drivers/cxl/cxl.h
>> +++ b/drivers/cxl/cxl.h
>> @@ -8,6 +8,7 @@
>>  #include <linux/bitfield.h>
>>  #include <linux/bitops.h>
>>  #include <linux/log2.h>
>> +#include <linux/cxl.h>
>>  #include <linux/io.h>
>>  
>>  /**
>> diff --git a/include/linux/cxl.h b/include/linux/cxl.h
>> new file mode 100644
>> index 000000000000..6702caa78e7a
>> --- /dev/null
>> +++ b/include/linux/cxl.h
> 
> We have other code outside drivers/cxl that gets to the
> CXL headers directly (to avoid this include being created before now).
> 
> https://elixir.bootlin.com/linux/v6.6-rc3/source/drivers/perf/cxl_pmu.c#L24
> 
> There wasn't much enthusiasm for creating a globally visible header hence
> I did that ../cxl stuff for that.
> 
> Same probably applies here.
> 

I'll look into using the private header. Thanks!

> Jonathan
> 
>> @@ -0,0 +1,11 @@
>> +#ifndef _LINUX_CXL_H
>> +#define _LINUX_CXL_H
>> +
>> +#include <linux/xarray.h>
>> +#include <linux/errno.h>
>> +
>> +struct cxl_dport;
>> +
>> +struct cxl_dport *cxl_find_rch_dport_by_rcrb(resource_size_t rcrb_base);
>> +
>> +#endif
>
Bjorn Helgaas Sept. 26, 2023, 8:22 p.m. UTC | #3
On Mon, Sep 25, 2023 at 03:01:26PM -0500, Ben Cheatham wrote:
> Add support for CXL EINJ error types for CXL 1.1 hosts added in ACPI
> v6.5. Because these error types target memory-mapped CXL 1.1 compliant
> downstream ports and not physical (normal/persistent) memory, these
> error types are not currently  allowed through the memory range
> validation done by the EINJ driver.
> 
> The MMIO address of a CXL 1.1 downstream port can be found in the
> cxl_rcrb_addr file in the corresponding dport directory under
> /sys/bus/cxl/devices/portX. CXL 1.1 error types follow the same
> procedure as a memory error type, but with param1 set to the
> downstream port MMIO address.
> 
> Example usage:
> $ cd /sys/kernel/debug/apei/einj
> $ cat available_error_type
>     0x00000008      Memory Correctable
>     0x00000010      Memory Uncorrectable non-fatal
>     0x00000020      Memory Uncorrectable fatal
>     0x00000040      PCI Express Correctable
>     0x00000080      PCI Express Uncorrectable non-fatal
>     0x00000100      PCI Express Uncorrectable fatal
>     0x00008000      CXL.mem Protocol Correctable
>     0x00020000      CXL.mem Protocol Uncorrectable fatal
> $ echo 0x8000 > error_type
> $ echo 0xfffffffffffff000 > param2
> $ echo 0x2 > flags
> $ cat /sys/bus/cxl/devices/portX/dportY/cxl_rcrb_addr
> 0xb2f00000
> $ echo 0xb2f00000 > param1

It seems sort of unpleasant to expose the physical base address of the
RCRB.  Is there any way to use a device address or other logical
identifier instead and keep the actual MMIO address internal?  E.g., a
PCI device has a <domain>:<bus>:<device>.<function> address.  I assume
CXL addresses would look similar?

> $ echo 1 > error_inject
> ...

> +static int is_valid_cxl_addr(u64 addr)

Maybe the function name should include a hint that this should be an
RCRB address?  But I don't claim to know the CXL architecture or
nomenclature.

Bjorn
Dan Williams Sept. 26, 2023, 9:36 p.m. UTC | #4
Ben Cheatham wrote:
> Add support for CXL EINJ error types for CXL 1.1 hosts added in ACPI
> v6.5. Because these error types target memory-mapped CXL 1.1 compliant
> downstream ports and not physical (normal/persistent) memory, these
> error types are not currently  allowed through the memory range
> validation done by the EINJ driver.
> 
> The MMIO address of a CXL 1.1 downstream port can be found in the
> cxl_rcrb_addr file in the corresponding dport directory under
> /sys/bus/cxl/devices/portX. CXL 1.1 error types follow the same
> procedure as a memory error type, but with param1 set to the
> downstream port MMIO address.
> 
> Example usage:
> $ cd /sys/kernel/debug/apei/einj
> $ cat available_error_type
>     0x00000008      Memory Correctable
>     0x00000010      Memory Uncorrectable non-fatal
>     0x00000020      Memory Uncorrectable fatal
>     0x00000040      PCI Express Correctable
>     0x00000080      PCI Express Uncorrectable non-fatal
>     0x00000100      PCI Express Uncorrectable fatal
>     0x00008000      CXL.mem Protocol Correctable
>     0x00020000      CXL.mem Protocol Uncorrectable fatal
> $ echo 0x8000 > error_type
> $ echo 0xfffffffffffff000 > param2
> $ echo 0x2 > flags
> $ cat /sys/bus/cxl/devices/portX/dportY/cxl_rcrb_addr
> 0xb2f00000
> $ echo 0xb2f00000 > param1
> $ echo 1 > error_inject

I have the same reaction to this as I did before:

http://lore.kernel.org/r/647817212bcf1_e067a2945@dwillia2-xfh.jf.intel.com.notmuch

Why is per-port error injection being driven from this legacy global
interface where userspace needs to take information from sysfs and walk
it over to this other interface? Especially since "rcrb" is an
implementation detail that will be invalidated with CXL VH topologies?

What I would like to see, since this is a new capability with no need to
be beholden to legacy is to disaggregate the interface to be per-port.

For example:

/sys/kernel/debug/cxl/$mem/{inject,clear}_poison is already established
for memory device poison injection. Why not add something like:

/sys/kernel/debug/cxl/$port/einj_{type,inject}

For triggering errors by the CXL subsystem device name, and unburden
userspace from needing to deal in magic numbers.
Ben Cheatham Sept. 27, 2023, 3:31 p.m. UTC | #5
On 9/26/23 3:22 PM, Bjorn Helgaas wrote:
> On Mon, Sep 25, 2023 at 03:01:26PM -0500, Ben Cheatham wrote:
>> Add support for CXL EINJ error types for CXL 1.1 hosts added in ACPI
>> v6.5. Because these error types target memory-mapped CXL 1.1 compliant
>> downstream ports and not physical (normal/persistent) memory, these
>> error types are not currently  allowed through the memory range
>> validation done by the EINJ driver.
>>
>> The MMIO address of a CXL 1.1 downstream port can be found in the
>> cxl_rcrb_addr file in the corresponding dport directory under
>> /sys/bus/cxl/devices/portX. CXL 1.1 error types follow the same
>> procedure as a memory error type, but with param1 set to the
>> downstream port MMIO address.
>>
>> Example usage:
>> $ cd /sys/kernel/debug/apei/einj
>> $ cat available_error_type
>>     0x00000008      Memory Correctable
>>     0x00000010      Memory Uncorrectable non-fatal
>>     0x00000020      Memory Uncorrectable fatal
>>     0x00000040      PCI Express Correctable
>>     0x00000080      PCI Express Uncorrectable non-fatal
>>     0x00000100      PCI Express Uncorrectable fatal
>>     0x00008000      CXL.mem Protocol Correctable
>>     0x00020000      CXL.mem Protocol Uncorrectable fatal
>> $ echo 0x8000 > error_type
>> $ echo 0xfffffffffffff000 > param2
>> $ echo 0x2 > flags
>> $ cat /sys/bus/cxl/devices/portX/dportY/cxl_rcrb_addr
>> 0xb2f00000
>> $ echo 0xb2f00000 > param1
> 
> It seems sort of unpleasant to expose the physical base address of the
> RCRB.  Is there any way to use a device address or other logical
> identifier instead and keep the actual MMIO address internal?  E.g., a
> PCI device has a <domain>:<bus>:<device>.<function> address.  I assume
> CXL addresses would look similar?
> 

I have the MMIO address of the dport exposed here because the ACPI spec
says that the address is expected for CXL 1.1 errors and I wanted to match
the spec. For CXL 2.0 errors the SBDF is expected as you described.

>> $ echo 1 > error_inject
>> ...
> 
>> +static int is_valid_cxl_addr(u64 addr)
> 
> Maybe the function name should include a hint that this should be an
> RCRB address?  But I don't claim to know the CXL architecture or
> nomenclature.
> 

Good point, if I keep this function in v6 I'll make the name more
descriptive.

> Bjorn
Ben Cheatham Sept. 27, 2023, 3:32 p.m. UTC | #6
On 9/26/23 4:36 PM, Dan Williams wrote:
> Ben Cheatham wrote:
>> Add support for CXL EINJ error types for CXL 1.1 hosts added in ACPI
>> v6.5. Because these error types target memory-mapped CXL 1.1 compliant
>> downstream ports and not physical (normal/persistent) memory, these
>> error types are not currently  allowed through the memory range
>> validation done by the EINJ driver.
>>
>> The MMIO address of a CXL 1.1 downstream port can be found in the
>> cxl_rcrb_addr file in the corresponding dport directory under
>> /sys/bus/cxl/devices/portX. CXL 1.1 error types follow the same
>> procedure as a memory error type, but with param1 set to the
>> downstream port MMIO address.
>>
>> Example usage:
>> $ cd /sys/kernel/debug/apei/einj
>> $ cat available_error_type
>>     0x00000008      Memory Correctable
>>     0x00000010      Memory Uncorrectable non-fatal
>>     0x00000020      Memory Uncorrectable fatal
>>     0x00000040      PCI Express Correctable
>>     0x00000080      PCI Express Uncorrectable non-fatal
>>     0x00000100      PCI Express Uncorrectable fatal
>>     0x00008000      CXL.mem Protocol Correctable
>>     0x00020000      CXL.mem Protocol Uncorrectable fatal
>> $ echo 0x8000 > error_type
>> $ echo 0xfffffffffffff000 > param2
>> $ echo 0x2 > flags
>> $ cat /sys/bus/cxl/devices/portX/dportY/cxl_rcrb_addr
>> 0xb2f00000
>> $ echo 0xb2f00000 > param1
>> $ echo 1 > error_inject
> 
> I have the same reaction to this as I did before:
> 
> http://lore.kernel.org/r/647817212bcf1_e067a2945@dwillia2-xfh.jf.intel.com.notmuch
> 
> Why is per-port error injection being driven from this legacy global
> interface where userspace needs to take information from sysfs and walk
> it over to this other interface? Especially since "rcrb" is an
> implementation detail that will be invalidated with CXL VH topologies?
> 

I get what you're saying, I did it this way because I saw this as primarily
an EINJ feature and wanted to keep it in that module. I agree with you though,
it's clunky and would be better inside the cxl module/debugfs.

> What I would like to see, since this is a new capability with no need to
> be beholden to legacy is to disaggregate the interface to be per-port.
> 
> For example:
> 
> /sys/kernel/debug/cxl/$mem/{inject,clear}_poison is already established
> for memory device poison injection. Why not add something like:
> 
> /sys/kernel/debug/cxl/$port/einj_{type,inject}
> 
> For triggering errors by the CXL subsystem device name, and unburden
> userspace from needing to deal in magic numbers.

I'll go ahead and move everything over to the cxl debugfs and do what you're
suggesting. Thanks for taking the time to take a look!

Thanks,
Ben
diff mbox series

Patch

diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index 6b18f8bc7be3..eb9cc7157342 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -55,6 +55,8 @@  config ACPI_APEI_MEMORY_FAILURE
 config ACPI_APEI_EINJ
 	tristate "APEI Error INJection (EINJ)"
 	depends on ACPI_APEI && DEBUG_FS
+	imply CXL_BUS
+	imply CXL_ACPI
 	help
 	  EINJ provides a hardware error injection mechanism, it is
 	  mainly used for debugging and testing the other parts of
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index 013eb621dc92..8000417a5f26 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -21,6 +21,7 @@ 
 #include <linux/nmi.h>
 #include <linux/delay.h>
 #include <linux/mm.h>
+#include <linux/cxl.h>
 #include <asm/unaligned.h>
 
 #include "apei-internal.h"
@@ -36,6 +37,7 @@ 
 #define MEM_ERROR_MASK		(ACPI_EINJ_MEMORY_CORRECTABLE | \
 				ACPI_EINJ_MEMORY_UNCORRECTABLE | \
 				ACPI_EINJ_MEMORY_FATAL)
+#define CXL_ERROR_MASK		GENMASK(17, 12)
 
 /*
  * ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action.
@@ -512,6 +514,22 @@  static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
 	return rc;
 }
 
+static int is_valid_cxl_addr(u64 addr)
+{
+	struct cxl_dport *dport;
+
+	if (IS_REACHABLE(CONFIG_CXL_ACPI))
+		dport = cxl_find_rch_dport_by_rcrb((resource_size_t) addr);
+	else
+		return 0;
+
+	if (!IS_ERR_OR_NULL(dport))
+		return 1;
+
+	pr_info("Could not find dport with rcrb 0x%llx\n", addr);
+	return 0;
+}
+
 /* Inject the specified hardware error */
 static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
 			     u64 param3, u64 param4)
@@ -537,8 +555,11 @@  static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
 	if (type & ACPI5_VENDOR_BIT) {
 		if (vendor_flags != SETWA_FLAGS_MEM)
 			goto inject;
-	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM))
+	} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM)) {
 		goto inject;
+	} else if (type & CXL_ERROR_MASK && is_valid_cxl_addr(param1)) {
+		goto inject;
+	}
 
 	/*
 	 * Disallow crazy address masks that give BIOS leeway to pick
@@ -807,3 +828,4 @@  module_exit(einj_exit);
 MODULE_AUTHOR("Huang Ying");
 MODULE_DESCRIPTION("APEI Error INJection support");
 MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS(CXL);
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 3e2ca946bf47..a7adf3d9814e 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -756,6 +756,7 @@  static void __exit cxl_acpi_exit(void)
 {
 	platform_driver_unregister(&cxl_acpi_driver);
 	cxl_bus_drain();
+	set_cxl_root(NULL);
 }
 
 /* load before dax_hmem sees 'Soft Reserved' CXL ranges */
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index c3914e73f67e..cb653cb1a1ea 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -1147,6 +1147,23 @@  struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
 }
 EXPORT_SYMBOL_NS_GPL(devm_cxl_add_rch_dport, CXL);
 
+struct cxl_dport *cxl_find_rch_dport_by_rcrb(resource_size_t rcrb_base)
+{
+	struct cxl_dport *dport;
+	unsigned long index;
+
+	if (!cxl_root)
+		return ERR_PTR(-ENODEV);
+
+	xa_for_each(&cxl_root->dports, index, dport)
+		if ((dport->rch && dport->rcrb.base != CXL_RESOURCE_NONE)
+		    && dport->rcrb.base == rcrb_base)
+			return dport;
+
+	return NULL;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_find_rch_dport_by_rcrb, CXL);
+
 static int add_ep(struct cxl_ep *new)
 {
 	struct cxl_port *port = new->dport->port;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 4d5bce4bae7e..3e6779dbcd23 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -8,6 +8,7 @@ 
 #include <linux/bitfield.h>
 #include <linux/bitops.h>
 #include <linux/log2.h>
+#include <linux/cxl.h>
 #include <linux/io.h>
 
 /**
diff --git a/include/linux/cxl.h b/include/linux/cxl.h
new file mode 100644
index 000000000000..6702caa78e7a
--- /dev/null
+++ b/include/linux/cxl.h
@@ -0,0 +1,11 @@ 
+#ifndef _LINUX_CXL_H
+#define _LINUX_CXL_H
+
+#include <linux/xarray.h>
+#include <linux/errno.h>
+
+struct cxl_dport;
+
+struct cxl_dport *cxl_find_rch_dport_by_rcrb(resource_size_t rcrb_base);
+
+#endif