diff mbox series

[2/3] cxl/mbox: Add GET_POISON_LIST mailbox command support

Message ID 382a9c35ef43e89db85670637d88371f9197b7a2.1655250669.git.alison.schofield@intel.com
State New, archived
Headers show
Series CXL Poison List Retrieval & Tracing | expand

Commit Message

Alison Schofield June 15, 2022, 12:10 a.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

CXL devices that support persistent memory maintain a list of locations
that are poisoned or result in poison if the addresses are accessed by
the host.

Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
list as a set of  Media Error Records that include the source of the
error, the starting device physical address and length. The length is
the number of adjacent DPAs in the record and is in units of 64 bytes.

Retrieve the list and log each Media Error Record as a trace event of
type cxl_poison_list.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
 drivers/cxl/cxlmem.h    | 43 +++++++++++++++++++++++
 drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)

Comments

Ira Weiny June 15, 2022, 3:22 a.m. UTC | #1
On Tue, Jun 14, 2022 at 05:10:27PM -0700, Alison Schofield wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices that support persistent memory maintain a list of locations
> that are poisoned or result in poison if the addresses are accessed by
> the host.
> 
> Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> list as a set of  Media Error Records that include the source of the
> error, the starting device physical address and length. The length is
> the number of adjacent DPAs in the record and is in units of 64 bytes.
> 
> Retrieve the list and log each Media Error Record as a trace event of
> type cxl_poison_list.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  drivers/cxl/cxlmem.h    | 43 +++++++++++++++++++++++
>  drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 118 insertions(+)
> 
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 60d10ee1e7fc..29cf0459b44a 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info {
>   *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
>   * @lsa_size: Size of Label Storage Area
>   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
> + * @poison_max_mer: maximum Media Error Records tracked in Poison List

Does not match the member name below.

Ira

>   * @mbox_mutex: Mutex to synchronize mailbox access.
>   * @firmware_version: Firmware version for the memory device.
>   * @enabled_cmds: Hardware commands found enabled in CEL.
> @@ -204,6 +205,7 @@ struct cxl_dev_state {
>  
>  	size_t payload_size;
>  	size_t lsa_size;
> +	u32 poison_max;
>  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>  	char firmware_version[0x10];
>  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
> @@ -317,6 +319,46 @@ struct cxl_mbox_set_partition_info {
>  
>  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
>  
> +struct cxl_mbox_poison_payload_in {
> +	__le64 offset;
> +	__le64 length;
> +} __packed;
> +
> +struct cxl_mbox_poison_payload_out {
> +	u8 flags;
> +	u8 rsvd1;
> +	__le64 overflow_timestamp;
> +	__le16 count;
> +	u8 rsvd2[0x14];
> +	struct cxl_poison_record {
> +		__le64 address;
> +		__le32 length;
> +		__le32 rsvd;
> +	} __packed record[];
> +} __packed;
> +
> +/* CXL 8.2.9.5.4.1 Get Poison List: payload out flags: */
> +#define CXL_POISON_FLAG_MORE            BIT(0)
> +#define CXL_POISON_FLAG_OVERFLOW        BIT(1)
> +#define CXL_POISON_FLAG_SCANNING        BIT(2)
> +
> +/* CXL 8.2.9.5.4.1 Get Poison List: Error is encoded in record.address[2:0] */
> +#define CXL_POISON_SOURCE_MASK		GENMASK(2, 0)
> +#define	CXL_POISON_SOURCE_UNKNOWN	0
> +#define	CXL_POISON_SOURCE_EXTERNAL	1
> +#define	CXL_POISON_SOURCE_INTERNAL	2
> +#define	CXL_POISON_SOURCE_INJECTED	3
> +#define	CXL_POISON_SOURCE_VENDOR	7
> +
> +/* Software define */
> +#define	CXL_POISON_SOURCE_INVALID	99
> +#define CXL_POISON_SOURCE_VALID(x)		\
> +	(((x) == CXL_POISON_SOURCE_UNKNOWN)  ||	\
> +	 ((x) == CXL_POISON_SOURCE_EXTERNAL) ||	\
> +	 ((x) == CXL_POISON_SOURCE_INTERNAL) ||	\
> +	 ((x) == CXL_POISON_SOURCE_INJECTED) ||	\
> +	 ((x) == CXL_POISON_SOURCE_VENDOR))
> +
>  /**
>   * struct cxl_mem_command - Driver representation of a memory device command
>   * @info: Command information as it exists for the UAPI
> @@ -351,6 +393,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
>  void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
>  void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> +int cxl_mem_get_poison_list(struct device *dev);
>  #ifdef CONFIG_CXL_SUSPEND
>  void cxl_mem_active_inc(void);
>  void cxl_mem_active_dec(void);
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 54f434733b56..c10c7020ebc2 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -9,6 +9,9 @@
>  
>  #include "core.h"
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/cxl.h>
> +
>  static bool cxl_raw_allow_all;
>  
>  /**
> @@ -755,6 +758,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>  {
>  	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
>  	struct cxl_mbox_identify id;
> +	__le32 val = 0;
>  	int rc;
>  
>  	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
> @@ -783,6 +787,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>  	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
>  	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
>  
> +	memcpy(&val, id.poison_list_max_mer, 3);
> +	cxlds->poison_max = le32_to_cpu(val);
> +
>  	return 0;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
> @@ -826,6 +833,74 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
>  
> +int cxl_mem_get_poison_list(struct device *dev)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct cxl_mbox_poison_payload_out *po;
> +	struct cxl_mbox_poison_payload_in pi;
> +	int nr_records = 0;
> +	int rc, i;
> +
> +	if (range_len(&cxlds->pmem_range)) {
> +		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> +		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));
> +	} else {
> +		return -ENXIO;
> +	}
> +
> +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> +	if (!po)
> +		return -ENOMEM;
> +
> +	do {
> +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> +				       sizeof(pi), po, cxlds->payload_size);
> +		if (rc)
> +			goto out;
> +
> +		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
> +			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
> +
> +			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
> +				&o_time);
> +			rc = -ENXIO;
> +			goto out;

I guess the idea is that this return will trigger something else will clear the list,
rebuild the list, and perform a scan media request?

I'm just wondering if this loop should continue to clear the list and then let
something else do the scan media request?

Other than that question and the above typo.  Looks good!

Ira

> +		}
> +
> +		if (po->flags & CXL_POISON_FLAG_SCANNING) {
> +			dev_err(dev, "Scan Media in Progress\n");
> +			rc = -EBUSY;
> +			goto out;
> +		}
> +
> +		for (i = 0; i < le16_to_cpu(po->count); i++) {
> +			u64 addr = le64_to_cpu(po->record[i].address);
> +			u32 len = le32_to_cpu(po->record[i].length);
> +			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
> +
> +			if (!CXL_POISON_SOURCE_VALID(source)) {
> +				dev_dbg(dev, "Invalid poison source %d",
> +					source);
> +				source = CXL_POISON_SOURCE_INVALID;
> +			}
> +
> +			trace_cxl_poison_list(dev, source, addr, len);
> +		}
> +
> +		/* Protect against an uncleared _FLAG_MORE */
> +		nr_records = nr_records + le16_to_cpu(po->count);
> +		if (nr_records >= cxlds->poison_max)
> +			goto out;
> +
> +	} while (po->flags & CXL_POISON_FLAG_MORE);
> +
> +out:
> +	kvfree(po);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
> +
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
>  {
>  	struct cxl_dev_state *cxlds;
> -- 
> 2.31.1
>
Alison Schofield June 15, 2022, 5:07 a.m. UTC | #2
On Tue, Jun 14, 2022 at 08:22:41PM -0700, Ira Weiny wrote:

Thanks for the review Ira...

> On Tue, Jun 14, 2022 at 05:10:27PM -0700, Alison Schofield wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 

snip

> > 
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 60d10ee1e7fc..29cf0459b44a 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info {
> >   *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
> >   * @lsa_size: Size of Label Storage Area
> >   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
> > + * @poison_max_mer: maximum Media Error Records tracked in Poison List
> 
> Does not match the member name below.
Got it! I intended to drop the _mer.

> 
> Ira
> 
> >   * @mbox_mutex: Mutex to synchronize mailbox access.
> >   * @firmware_version: Firmware version for the memory device.
> >   * @enabled_cmds: Hardware commands found enabled in CEL.
> > @@ -204,6 +205,7 @@ struct cxl_dev_state {
> >  
> >  	size_t payload_size;
> >  	size_t lsa_size;
> > +	u32 poison_max;
> >  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
> >  	char firmware_version[0x10];
> >  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);

snip

> > +int cxl_mem_get_poison_list(struct device *dev)
> > +{
> > +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +	struct cxl_mbox_poison_payload_out *po;
> > +	struct cxl_mbox_poison_payload_in pi;
> > +	int nr_records = 0;
> > +	int rc, i;
> > +
> > +	if (range_len(&cxlds->pmem_range)) {
> > +		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> > +		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));
> > +	} else {
> > +		return -ENXIO;
> > +	}
> > +
> > +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> > +	if (!po)
> > +		return -ENOMEM;
> > +
> > +	do {
> > +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> > +				       sizeof(pi), po, cxlds->payload_size);
> > +		if (rc)
> > +			goto out;
> > +
> > +		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
> > +			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
> > +
> > +			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
> > +				&o_time);
> > +			rc = -ENXIO;
> > +			goto out;
> 
> I guess the idea is that this return will trigger something else will clear the list,
> rebuild the list, and perform a scan media request?
> 
Per CXL Spec 8.2.9.5.4.1: The poison list may be incomplete when the list
has overflowed. User can perform a Scan Media to try to clear and rebuild
the list, with no guarantee that the overflow will not recur.

So yes to what you are saying. This return value should indicate to
user space that a Scan Media should be issued. Issuing the Scan Media
to the device does lead the device to rebuild it's list, as you say.
Also, when we get the Scan Media results, the device is able to report
partial results and tell the host to collect the error records, and
then restart the scan, get results again, and on and on until the scan
is complete.

Perhaps a clarification - there is not a logical pairing of Scan Media
followed by Get Poison List.  Scan Media followed by Get Scan Media
Results is the logical pairing. Get Poison List is getting a snapshot
of the poison list at a point in time. The device adds DPAs to the list
when the device detects poison, some devices run their own backround
scans and add to the poison list, and then there are the user initiated
actions (Scan Media and Poison Inject) that can affect the list.

> I'm just wondering if this loop should continue to clear the list and then let
> something else do the scan media request?

It's not like the _MORE status where the device is telling the host to
come back and gather more. I think the action of failing, and letting
user initiated a Scan Media is correct course here.

So, this response got kind of long winded. As you can see, especially
if one looks in the spec as I know you are, there are additional
commands that need to be implemented to complete the ARS feature set.
And, of course, we'll offer user space tooling (NDCTL and libcxl).

> 
> Other than that question and the above typo.  Looks good!
> 
> Ira
> 
> > +		}
> > +
> > +		if (po->flags & CXL_POISON_FLAG_SCANNING) {
> > +			dev_err(dev, "Scan Media in Progress\n");
> > +			rc = -EBUSY;
> > +			goto out;
> > +		}
> > +
> > +		for (i = 0; i < le16_to_cpu(po->count); i++) {
> > +			u64 addr = le64_to_cpu(po->record[i].address);
> > +			u32 len = le32_to_cpu(po->record[i].length);
> > +			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
> > +
> > +			if (!CXL_POISON_SOURCE_VALID(source)) {
> > +				dev_dbg(dev, "Invalid poison source %d",
> > +					source);
> > +				source = CXL_POISON_SOURCE_INVALID;
> > +			}
> > +
> > +			trace_cxl_poison_list(dev, source, addr, len);
> > +		}
> > +
> > +		/* Protect against an uncleared _FLAG_MORE */
> > +		nr_records = nr_records + le16_to_cpu(po->count);
> > +		if (nr_records >= cxlds->poison_max)
> > +			goto out;
> > +
> > +	} while (po->flags & CXL_POISON_FLAG_MORE);
> > +
> > +out:
> > +	kvfree(po);
> > +	return rc;
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
> > +
> >  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
> >  {
> >  	struct cxl_dev_state *cxlds;
> > -- 
> > 2.31.1
> >
Ira Weiny June 15, 2022, 3:01 p.m. UTC | #3
On Tue, Jun 14, 2022 at 10:07:52PM -0700, Alison Schofield wrote:
> On Tue, Jun 14, 2022 at 08:22:41PM -0700, Ira Weiny wrote:
> 

[snip]

> > > +
> > > +	do {
> > > +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> > > +				       sizeof(pi), po, cxlds->payload_size);
> > > +		if (rc)
> > > +			goto out;
> > > +
> > > +		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
> > > +			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
> > > +
> > > +			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
> > > +				&o_time);
> > > +			rc = -ENXIO;
> > > +			goto out;
> > 
> > I guess the idea is that this return will trigger something else will clear the list,
> > rebuild the list, and perform a scan media request?
> > 
> Per CXL Spec 8.2.9.5.4.1: The poison list may be incomplete when the list
> has overflowed. User can perform a Scan Media to try to clear and rebuild
> the list, with no guarantee that the overflow will not recur.
> 
> So yes to what you are saying. This return value should indicate to
> user space that a Scan Media should be issued. Issuing the Scan Media
> to the device does lead the device to rebuild it's list, as you say.
> Also, when we get the Scan Media results, the device is able to report
> partial results and tell the host to collect the error records, and
> then restart the scan, get results again, and on and on until the scan
> is complete.
> 
> Perhaps a clarification - there is not a logical pairing of Scan Media
> followed by Get Poison List.  Scan Media followed by Get Scan Media
> Results is the logical pairing. Get Poison List is getting a snapshot
> of the poison list at a point in time. The device adds DPAs to the list
> when the device detects poison, some devices run their own backround
> scans and add to the poison list, and then there are the user initiated
> actions (Scan Media and Poison Inject) that can affect the list.
> 
> > I'm just wondering if this loop should continue to clear the list and then let
> > something else do the scan media request?
> 
> It's not like the _MORE status where the device is telling the host to
> come back and gather more. I think the action of failing, and letting
> user initiated a Scan Media is correct course here.

Fair enough.  But I guess I'm still confused by the spec.  The way I read it
yesterday (and I could be wrong) was that the OS was supposed to read the
entries to clear the list?  Is that not true?

I the device will clear the list internally when Scan Media is run?

At this point I'm just trying to understand not necessarily objecting to the
patch.

Ira

> 
> So, this response got kind of long winded. As you can see, especially
> if one looks in the spec as I know you are, there are additional
> commands that need to be implemented to complete the ARS feature set.
> And, of course, we'll offer user space tooling (NDCTL and libcxl).
>
Alison Schofield June 15, 2022, 5:19 p.m. UTC | #4
On Wed, Jun 15, 2022 at 08:01:50AM -0700, Ira Weiny wrote:
> On Tue, Jun 14, 2022 at 10:07:52PM -0700, Alison Schofield wrote:
> > On Tue, Jun 14, 2022 at 08:22:41PM -0700, Ira Weiny wrote:
> > 
> 
> [snip]
> 
> > > > +
> > > > +	do {
> > > > +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> > > > +				       sizeof(pi), po, cxlds->payload_size);
> > > > +		if (rc)
> > > > +			goto out;
> > > > +
> > > > +		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
> > > > +			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
> > > > +
> > > > +			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
> > > > +				&o_time);
> > > > +			rc = -ENXIO;
> > > > +			goto out;
> > > 
> > > I guess the idea is that this return will trigger something else will clear the list,
> > > rebuild the list, and perform a scan media request?
> > > 
> > Per CXL Spec 8.2.9.5.4.1: The poison list may be incomplete when the list
> > has overflowed. User can perform a Scan Media to try to clear and rebuild
> > the list, with no guarantee that the overflow will not recur.
> > 
> > So yes to what you are saying. This return value should indicate to
> > user space that a Scan Media should be issued. Issuing the Scan Media
> > to the device does lead the device to rebuild it's list, as you say.
> > Also, when we get the Scan Media results, the device is able to report
> > partial results and tell the host to collect the error records, and
> > then restart the scan, get results again, and on and on until the scan
> > is complete.
> > 
> > Perhaps a clarification - there is not a logical pairing of Scan Media
> > followed by Get Poison List.  Scan Media followed by Get Scan Media
> > Results is the logical pairing. Get Poison List is getting a snapshot
> > of the poison list at a point in time. The device adds DPAs to the list
> > when the device detects poison, some devices run their own backround
> > scans and add to the poison list, and then there are the user initiated
> > actions (Scan Media and Poison Inject) that can affect the list.
> > 
> > > I'm just wondering if this loop should continue to clear the list and then let
> > > something else do the scan media request?
> > 
> > It's not like the _MORE status where the device is telling the host to
> > come back and gather more. I think the action of failing, and letting
> > user initiated a Scan Media is correct course here.
> 
> Fair enough.  But I guess I'm still confused by the spec.  The way I read it
> yesterday (and I could be wrong) was that the OS was supposed to read the
> entries to clear the list?  Is that not true?

I think - not true.

Get_Poison_List has no effect on the contents of the list itself.
Even with its MORE flag, it is not clearing any poison, it's just
telling the host that it had more records than could fit in one
device payload so they will have to delivered to the host in multiple
requests. I'd expect issuing multiple Get Poison List requests would
get same results.  (unless of course the media was going bad quickly

Maybe you are conflating w other cmds: Scan Media & Clear Poison

> 
> I the device will clear the list internally when Scan Media is run?

Spec says device 'rebuids' the list. I might guess that's a clear and
start anew, but not the hosts business. As a host, we wait for Scan
Media to complete before issuing Get Scan Media Results or Get Poison
List.
> 
> At this point I'm just trying to understand not necessarily objecting to the
> patch.

NP.  The questions are helpful!

> 
> Ira
> 
> > 
> > So, this response got kind of long winded. As you can see, especially
> > if one looks in the spec as I know you are, there are additional
> > commands that need to be implemented to complete the ARS feature set.
> > And, of course, we'll offer user space tooling (NDCTL and libcxl).
> >
Davidlohr Bueso June 16, 2022, 7:43 p.m. UTC | #5
On Tue, 14 Jun 2022, alison.schofield@intel.com wrote:

>From: Alison Schofield <alison.schofield@intel.com>
>
>CXL devices that support persistent memory maintain a list of locations
>that are poisoned or result in poison if the addresses are accessed by
>the host.
>
>Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
>list as a set of  Media Error Records that include the source of the
>error, the starting device physical address and length. The length is
>the number of adjacent DPAs in the record and is in units of 64 bytes.
>
>Retrieve the list and log each Media Error Record as a trace event of
>type cxl_poison_list.
>
>Signed-off-by: Alison Schofield <alison.schofield@intel.com>
>---
> drivers/cxl/cxlmem.h    | 43 +++++++++++++++++++++++
> drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 118 insertions(+)
>
>diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
>index 60d10ee1e7fc..29cf0459b44a 100644
>--- a/drivers/cxl/cxlmem.h
>+++ b/drivers/cxl/cxlmem.h
>@@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info {
>  *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
>  * @lsa_size: Size of Label Storage Area
>  *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
>+ * @poison_max_mer: maximum Media Error Records tracked in Poison List
>  * @mbox_mutex: Mutex to synchronize mailbox access.
>  * @firmware_version: Firmware version for the memory device.
>  * @enabled_cmds: Hardware commands found enabled in CEL.
>@@ -204,6 +205,7 @@ struct cxl_dev_state {
>
>	size_t payload_size;
>	size_t lsa_size;
>+	u32 poison_max;
>	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>	char firmware_version[0x10];
>	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>@@ -317,6 +319,46 @@ struct cxl_mbox_set_partition_info {
>
> #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
>
>+struct cxl_mbox_poison_payload_in {
>+	__le64 offset;
>+	__le64 length;
>+} __packed;
>+
>+struct cxl_mbox_poison_payload_out {
>+	u8 flags;
>+	u8 rsvd1;
>+	__le64 overflow_timestamp;
>+	__le16 count;
>+	u8 rsvd2[0x14];
>+	struct cxl_poison_record {
>+		__le64 address;
>+		__le32 length;
>+		__le32 rsvd;
>+	} __packed record[];
>+} __packed;
>+
>+/* CXL 8.2.9.5.4.1 Get Poison List: payload out flags: */
>+#define CXL_POISON_FLAG_MORE            BIT(0)
>+#define CXL_POISON_FLAG_OVERFLOW        BIT(1)
>+#define CXL_POISON_FLAG_SCANNING        BIT(2)
>+
>+/* CXL 8.2.9.5.4.1 Get Poison List: Error is encoded in record.address[2:0] */
>+#define CXL_POISON_SOURCE_MASK		GENMASK(2, 0)
>+#define	CXL_POISON_SOURCE_UNKNOWN	0
>+#define	CXL_POISON_SOURCE_EXTERNAL	1
>+#define	CXL_POISON_SOURCE_INTERNAL	2
>+#define	CXL_POISON_SOURCE_INJECTED	3
>+#define	CXL_POISON_SOURCE_VENDOR	7
>+
>+/* Software define */
>+#define	CXL_POISON_SOURCE_INVALID	99
>+#define CXL_POISON_SOURCE_VALID(x)		\
>+	(((x) == CXL_POISON_SOURCE_UNKNOWN)  ||	\
>+	 ((x) == CXL_POISON_SOURCE_EXTERNAL) ||	\
>+	 ((x) == CXL_POISON_SOURCE_INTERNAL) ||	\
>+	 ((x) == CXL_POISON_SOURCE_INJECTED) ||	\
>+	 ((x) == CXL_POISON_SOURCE_VENDOR))
>+
> /**
>  * struct cxl_mem_command - Driver representation of a memory device command
>  * @info: Command information as it exists for the UAPI
>@@ -351,6 +393,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
> struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
> void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
>+int cxl_mem_get_poison_list(struct device *dev);
> #ifdef CONFIG_CXL_SUSPEND
> void cxl_mem_active_inc(void);
> void cxl_mem_active_dec(void);
>diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
>index 54f434733b56..c10c7020ebc2 100644
>--- a/drivers/cxl/core/mbox.c
>+++ b/drivers/cxl/core/mbox.c
>@@ -9,6 +9,9 @@
>
> #include "core.h"
>
>+#define CREATE_TRACE_POINTS
>+#include <trace/events/cxl.h>
>+
> static bool cxl_raw_allow_all;
>
> /**
>@@ -755,6 +758,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
> {
>	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
>	struct cxl_mbox_identify id;
>+	__le32 val = 0;
>	int rc;
>
>	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
>@@ -783,6 +787,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
>	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
>
>+	memcpy(&val, id.poison_list_max_mer, 3);
>+	cxlds->poison_max = le32_to_cpu(val);
>+
>	return 0;
> }
> EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
>@@ -826,6 +833,74 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
> }
> EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
>
>+int cxl_mem_get_poison_list(struct device *dev)
>+{
>+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
>+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>+	struct cxl_mbox_poison_payload_out *po;
>+	struct cxl_mbox_poison_payload_in pi;
>+	int nr_records = 0;
>+	int rc, i;
>+
>+	if (range_len(&cxlds->pmem_range)) {
>+		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
>+		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));

Do you ever see this changing to not always use the full pmem DPA range
but allow arbitrary ones? I also assume this is the reason why you don't
check the range vs cxlds->ram_range to prevent any overlaps, no?

Thanks,
Davidlohr

>+	} else {
>+		return -ENXIO;
>+	}
>+
>+	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
>+	if (!po)
>+		return -ENOMEM;
>+
>+	do {
>+		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
>+				       sizeof(pi), po, cxlds->payload_size);
>+		if (rc)
>+			goto out;
>+
>+		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
>+			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
>+
>+			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
>+				&o_time);
>+			rc = -ENXIO;
>+			goto out;
>+		}
>+
>+		if (po->flags & CXL_POISON_FLAG_SCANNING) {
>+			dev_err(dev, "Scan Media in Progress\n");
>+			rc = -EBUSY;
>+			goto out;
>+		}
>+
>+		for (i = 0; i < le16_to_cpu(po->count); i++) {
>+			u64 addr = le64_to_cpu(po->record[i].address);
>+			u32 len = le32_to_cpu(po->record[i].length);
>+			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
>+
>+			if (!CXL_POISON_SOURCE_VALID(source)) {
>+				dev_dbg(dev, "Invalid poison source %d",
>+					source);
>+				source = CXL_POISON_SOURCE_INVALID;
>+			}
>+
>+			trace_cxl_poison_list(dev, source, addr, len);
>+		}
>+
>+		/* Protect against an uncleared _FLAG_MORE */
>+		nr_records = nr_records + le16_to_cpu(po->count);
>+		if (nr_records >= cxlds->poison_max)
>+			goto out;
>+
>+	} while (po->flags & CXL_POISON_FLAG_MORE);
>+
>+out:
>+	kvfree(po);
>+	return rc;
>+}
>+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
>+
> struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
> {
>	struct cxl_dev_state *cxlds;
>
>--
>2.31.1
>
Alison Schofield June 16, 2022, 8:34 p.m. UTC | #6
On Thu, Jun 16, 2022 at 12:43:34PM -0700, Davidlohr Bueso wrote:
> On Tue, 14 Jun 2022, alison.schofield@intel.com wrote:
> 
> >From: Alison Schofield <alison.schofield@intel.com>
> >
> >CXL devices that support persistent memory maintain a list of locations
> >that are poisoned or result in poison if the addresses are accessed by
> >the host.
> >
> >Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> >list as a set of  Media Error Records that include the source of the
> >error, the starting device physical address and length. The length is
> >the number of adjacent DPAs in the record and is in units of 64 bytes.
> >
> >Retrieve the list and log each Media Error Record as a trace event of
> >type cxl_poison_list.
> >
> >Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> >---
> > drivers/cxl/cxlmem.h    | 43 +++++++++++++++++++++++
> > drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 118 insertions(+)
> >
snip

> >+int cxl_mem_get_poison_list(struct device *dev)
> >+{
> >+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> >+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> >+	struct cxl_mbox_poison_payload_out *po;
> >+	struct cxl_mbox_poison_payload_in pi;
> >+	int nr_records = 0;
> >+	int rc, i;
> >+
> >+	if (range_len(&cxlds->pmem_range)) {
> >+		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> >+		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));

First off - you stopped at a bug here - that pi.length needs to be
in units of 64 bytes.
> 
> Do you ever see this changing to not always use the full pmem DPA range
> but allow arbitrary ones? I also assume this is the reason why you don't
> check the range vs cxlds->ram_range to prevent any overlaps, no?
> 
> Thanks,
> Davidlohr

David - Great question!

I'm headed in this direction -

cxl list --media-errors -m mem1
	lists media errors for requested memdev

cxl list --media-errors -r region#
	lists region errors with HPA addresses
	(So here cxl tool will collect the poison for all the regions
	 memdevs and do the DPA to HPA translation)

To answer your question, I wasn't thinking of limiting
the range within the memdev, but certainly could. And if we were
taking in ranges, those ranges would need to be checked.

$cxl list --media-errors -m mem1 --range-start=  --range-end|len=

Now, if I left the sysfs inteface as is, the driver will read the 
entire poison list for the memdev and then cxl tool will filter it
for the range requested. 

Or, maybe we should implement in libcxl (not sysfs), with memdev and
range options and only collect from the device the range requested.

Either one looks the same to the cxl tool user, but limiting the
range we send to the device would certainly cut down on unwanted
records being logged, retrieved, and examined.

I'd like to hear more from you and other community members.

Alison

> > snip
Davidlohr Bueso June 16, 2022, 9:47 p.m. UTC | #7
On Thu, 16 Jun 2022, Alison Schofield wrote:
>I'm headed in this direction -

I like these interfaces, btw.

>cxl list --media-errors -m mem1
>	lists media errors for requested memdev

But in this patchset you're only listing for persistent configurations.
So if there is a volatile partion, or the whole device is volatile,
this would not consider that.

So unless I'm missing something, we need to consider ram_range as well.

>cxl list --media-errors -r region#
>	lists region errors with HPA addresses
>	(So here cxl tool will collect the poison for all the regions
>	 memdevs and do the DPA to HPA translation)

I was indeed thinking along these lines. But similar to the above,
the region driver also has plans to enumarate volatile regions
configured by BIOS.

>
>To answer your question, I wasn't thinking of limiting
>the range within the memdev, but certainly could. And if we were
>taking in ranges, those ranges would need to be checked.

My question was originally considering poisoning only within pmem DPA
ranges, but now I'm wondering if all this also applies equally to volatile
parts as well... Reading the spec I interpret both, but reading the
T3 Memory Device Software Guide '2.13.19' it only mentions persistent
capacity.

>
>$cxl list --media-errors -m mem1 --range-start=  --range-end|len=

I figure this kind of like the above with regions being very arbitrary
and dynamic.

>Now, if I left the sysfs interface as is, the driver will read the
>entire poison list for the memdev and then cxl tool will filter it
>for the range requested.
>
>Or, maybe we should implement in libcxl (not sysfs), with memdev and
>range options and only collect from the device the range requested.

I wonder if the latter may be the better option considering that always
scanning the entire memdev would cause unnecessary media scan wait times,
specially for large capacities.

Thanks,
Davidlohr
Alison Schofield June 16, 2022, 10:10 p.m. UTC | #8
David - you make lots of good points, one quick comments at end...

On Thu, Jun 16, 2022 at 02:47:40PM -0700, Davidlohr Bueso wrote:
> On Thu, 16 Jun 2022, Alison Schofield wrote:
> >I'm headed in this direction -
> 
> I like these interfaces, btw.
> 
> >cxl list --media-errors -m mem1
> >	lists media errors for requested memdev
> 
> But in this patchset you're only listing for persistent configurations.
> So if there is a volatile partion, or the whole device is volatile,
> this would not consider that.
> 
> So unless I'm missing something, we need to consider ram_range as well.
> 
> >cxl list --media-errors -r region#
> >	lists region errors with HPA addresses
> >	(So here cxl tool will collect the poison for all the regions
> >	 memdevs and do the DPA to HPA translation)
> 
> I was indeed thinking along these lines. But similar to the above,
> the region driver also has plans to enumarate volatile regions
> configured by BIOS.
> 
> >
> >To answer your question, I wasn't thinking of limiting
> >the range within the memdev, but certainly could. And if we were
> >taking in ranges, those ranges would need to be checked.
> 
> My question was originally considering poisoning only within pmem DPA
> ranges, but now I'm wondering if all this also applies equally to volatile
> parts as well... Reading the spec I interpret both, but reading the
> T3 Memory Device Software Guide '2.13.19' it only mentions persistent
> capacity.
> 
> >
> >$cxl list --media-errors -m mem1 --range-start=  --range-end|len=
> 
> I figure this kind of like the above with regions being very arbitrary
> and dynamic.
> 
> >Now, if I left the sysfs interface as is, the driver will read the
> >entire poison list for the memdev and then cxl tool will filter it
> >for the range requested.
> >
> >Or, maybe we should implement in libcxl (not sysfs), with memdev and
> >range options and only collect from the device the range requested.
> 
> I wonder if the latter may be the better option considering that always
> scanning the entire memdev would cause unnecessary media scan wait times,
> specially for large capacities.

This is not a Media Scan. This is only reading the existing Poison List.

> 
> Thanks,
> Davidlohr
Davidlohr Bueso June 16, 2022, 10:20 p.m. UTC | #9
On Thu, 16 Jun 2022, Alison Schofield wrote:
>> I wonder if the latter may be the better option considering that always
>> scanning the entire memdev would cause unnecessary media scan wait times,
>> specially for large capacities.
>
>This is not a Media Scan. This is only reading the existing Poison List.

Right, but I was thinking we'd want a input similar interface passing the
desidered range.

Thanks,
Davidlohr
Davidlohr Bueso June 16, 2022, 10:45 p.m. UTC | #10
On Thu, 16 Jun 2022, Alison Schofield wrote:

>cxl list --media-errors -m mem1
>	lists media errors for requested memdev
>
>cxl list --media-errors -r region#

A quick question on the tooling front: the above goes nicely with
cxl-list, but what about the rest of the poisoning cmds? Do you have
anything in mind? Do we want something specific for media and poison
management instead? Ie:

cxl media --list-errors <params>
cxl media --inject-errors <params>
cxl media --clear-errors <params>

Thanks,
Davidlohr
Alison Schofield June 16, 2022, 11:15 p.m. UTC | #11
On Thu, Jun 16, 2022 at 03:45:25PM -0700, Davidlohr Bueso wrote:
> On Thu, 16 Jun 2022, Alison Schofield wrote:
> 
> >cxl list --media-errors -m mem1
> >	lists media errors for requested memdev
> >
> >cxl list --media-errors -r region#
> 
> A quick question on the tooling front: the above goes nicely with
> cxl-list, but what about the rest of the poisoning cmds? Do you have
> anything in mind? Do we want something specific for media and poison
> management instead? Ie:
> 
> cxl media --list-errors <params>
Not clear how this one differs. Seems like we can get any piece of 
the list w cxl list.

> cxl media --inject-errors <params>
> cxl media --clear-errors <params>
For inject/clear I'd probably start w what ndctl does today.
ndctl inject−error  <namespace> [<options>]
where option -d --uninject performs the clear.

> 
> Thanks,
> Davidlohr
Verma, Vishal L June 16, 2022, 11:44 p.m. UTC | #12
On Thu, 2022-06-16 at 16:15 -0700, Alison Schofield wrote:
> On Thu, Jun 16, 2022 at 03:45:25PM -0700, Davidlohr Bueso wrote:
> > On Thu, 16 Jun 2022, Alison Schofield wrote:
> > 
> > > cxl list --media-errors -m mem1
> > >         lists media errors for requested memdev
> > > 
> > > cxl list --media-errors -r region#
> > 
> > A quick question on the tooling front: the above goes nicely with
> > cxl-list, but what about the rest of the poisoning cmds? Do you
> > have
> > anything in mind? Do we want something specific for media and
> > poison
> > management instead? Ie:
> > 
> > cxl media --list-errors <params>
> Not clear how this one differs. Seems like we can get any piece of 
> the list w cxl list.
> 
> > cxl media --inject-errors <params>
> > cxl media --clear-errors <params>
> For inject/clear I'd probably start w what ndctl does today.
> ndctl inject−error  <namespace> [<options>]
> where option -d --uninject performs the clear.

Yeah agreed with Alison that cxl inject-error <options> sounds good.
Generally speaking, we've tried to avoid the 'sub-command of a
subcommand' situation - such as 'cxl <media> <sub-sub-command>
<options>', instead prefering 'cxl <action-object> <options>'.

> 
> > 
> > Thanks,
> > Davidlohr
Davidlohr Bueso June 17, 2022, 12:03 a.m. UTC | #13
On Thu, 16 Jun 2022, Verma, Vishal L wrote:
>Yeah agreed with Alison that cxl inject-error <options> sounds good.

I agree as well, that's a much nicer interface.

Thanks,
Davidlohr
Jonathan Cameron June 17, 2022, 1:01 p.m. UTC | #14
On Tue, 14 Jun 2022 17:10:27 -0700
alison.schofield@intel.com wrote:

> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices that support persistent memory maintain a list of locations
> that are poisoned or result in poison if the addresses are accessed by
> the host.
> 
> Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> list as a set of  Media Error Records that include the source of the
> error, the starting device physical address and length. The length is
> the number of adjacent DPAs in the record and is in units of 64 bytes.
> 
> Retrieve the list and log each Media Error Record as a trace event of
> type cxl_poison_list.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
 
> +int cxl_mem_get_poison_list(struct device *dev)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct cxl_mbox_poison_payload_out *po;
> +	struct cxl_mbox_poison_payload_in pi;
> +	int nr_records = 0;
> +	int rc, i;
> +
> +	if (range_len(&cxlds->pmem_range)) {
> +		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> +		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));
> +	} else {
> +		return -ENXIO;
> +	}
> +
> +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> +	if (!po)
> +		return -ENOMEM;
> +
> +	do {
> +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> +				       sizeof(pi), po, cxlds->payload_size);
> +		if (rc)
> +			goto out;
> +
> +		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
> +			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
> +
> +			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
> +				&o_time);
> +			rc = -ENXIO;
> +			goto out;
> +		}
> +
> +		if (po->flags & CXL_POISON_FLAG_SCANNING) {
> +			dev_err(dev, "Scan Media in Progress\n");
> +			rc = -EBUSY;
> +			goto out;
> +		}
> +
> +		for (i = 0; i < le16_to_cpu(po->count); i++) {
> +			u64 addr = le64_to_cpu(po->record[i].address);
> +			u32 len = le32_to_cpu(po->record[i].length);
> +			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
> +
> +			if (!CXL_POISON_SOURCE_VALID(source)) {
> +				dev_dbg(dev, "Invalid poison source %d",
> +					source);
> +				source = CXL_POISON_SOURCE_INVALID;
> +			}
> +
> +			trace_cxl_poison_list(dev, source, addr, len);
> +		}
> +
> +		/* Protect against an uncleared _FLAG_MORE */
> +		nr_records = nr_records + le16_to_cpu(po->count);
> +		if (nr_records >= cxlds->poison_max)

If this happens and _FLAG_MORE is set (it will occur anyway currently
if we happen to have poison_max records - I hit this in QEMU because
until now default of poison_max == 0)
then we should spit out an error message as I think that means the
hardware is broken.


> +			goto out;
> +
> +	} while (po->flags & CXL_POISON_FLAG_MORE);
> +
> +out:
> +	kvfree(po);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
> +
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
>  {
>  	struct cxl_dev_state *cxlds;
Jonathan Cameron June 17, 2022, 2:05 p.m. UTC | #15
On Tue, 14 Jun 2022 17:10:27 -0700
alison.schofield@intel.com wrote:

> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices that support persistent memory maintain a list of locations
> that are poisoned or result in poison if the addresses are accessed by
> the host.
> 
> Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> list as a set of  Media Error Records that include the source of the
> error, the starting device physical address and length. The length is
> the number of adjacent DPAs in the record and is in units of 64 bytes.
> 
> Retrieve the list and log each Media Error Record as a trace event of
> type cxl_poison_list.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>

A few more things inline.

Otherwise, can confirm it works with some hack QEMU code.
I'll tidy that up and post soon.

> +int cxl_mem_get_poison_list(struct device *dev)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct cxl_mbox_poison_payload_out *po;
> +	struct cxl_mbox_poison_payload_in pi;
> +	int nr_records = 0;
> +	int rc, i;
> +
> +	if (range_len(&cxlds->pmem_range)) {
> +		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> +		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));
> +	} else {
> +		return -ENXIO;
> +	}
> +
> +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> +	if (!po)
> +		return -ENOMEM;
> +
> +	do {
> +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> +				       sizeof(pi), po, cxlds->payload_size);
> +		if (rc)
> +			goto out;
> +
> +		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
> +			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
> +
> +			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
> +				&o_time);
> +			rc = -ENXIO;
> +			goto out;
> +		}
> +
> +		if (po->flags & CXL_POISON_FLAG_SCANNING) {
> +			dev_err(dev, "Scan Media in Progress\n");
> +			rc = -EBUSY;
> +			goto out;
> +		}
> +
> +		for (i = 0; i < le16_to_cpu(po->count); i++) {
> +			u64 addr = le64_to_cpu(po->record[i].address);

> +			u32 len = le32_to_cpu(po->record[i].length);


> +			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
> +
> +			if (!CXL_POISON_SOURCE_VALID(source)) {
> +				dev_dbg(dev, "Invalid poison source %d",
> +					source);
> +				source = CXL_POISON_SOURCE_INVALID;
> +			}
> +
> +			trace_cxl_poison_list(dev, source, addr, len);

Need to mask off the lower 6 bits of addr as they contain the source
+ a few reserved bits.

I was confused how you were geting better than 64 byte precision in your
example.

> +		}
> +
> +		/* Protect against an uncleared _FLAG_MORE */
> +		nr_records = nr_records + le16_to_cpu(po->count);
> +		if (nr_records >= cxlds->poison_max)
> +			goto out;
> +
> +	} while (po->flags & CXL_POISON_FLAG_MORE);
So.. A conundrum here.  What happens if:

1. We get an error mid way through a set of multiple reads
   (something intermittent - maybe a software issue)
2. We will drop out of here fine and report the error.
3. We run this function again.

It will (I think) currently pick up where we left off, but we have
no way of knowing that as there isn't a 'total records' count or
any other form of index in the output payload.

So, software solutions I think should work (though may warrant a note
to be added to the spec).

1. Read whole thing twice. First time is just to ensure we get
   to the end and flush out any prior half done reads.
2. Issue a read for a different region (perhaps length 0) first
   and assume everything starts from scratch when we go back to
   this region.

Jonathan



> +
> +out:
> +	kvfree(po);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
> +
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
>  {
>  	struct cxl_dev_state *cxlds;
Alison Schofield June 17, 2022, 4:29 p.m. UTC | #16
On Fri, Jun 17, 2022 at 07:05:08AM -0700, Jonathan Cameron wrote:
> On Tue, 14 Jun 2022 17:10:27 -0700
> alison.schofield@intel.com wrote:
> 
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices that support persistent memory maintain a list of locations
> > that are poisoned or result in poison if the addresses are accessed by
> > the host.
> > 
> > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> > list as a set of  Media Error Records that include the source of the
> > error, the starting device physical address and length. The length is
> > the number of adjacent DPAs in the record and is in units of 64 bytes.
> > 
> > Retrieve the list and log each Media Error Record as a trace event of
> > type cxl_poison_list.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> 
> A few more things inline.
> 
> Otherwise, can confirm it works with some hack QEMU code.
> I'll tidy that up and post soon.
> 
> > +int cxl_mem_get_poison_list(struct device *dev)
> > +{
snip
> > +
> > +			trace_cxl_poison_list(dev, source, addr, len);
> 
> Need to mask off the lower 6 bits of addr as they contain the source
> + a few reserved bits.
> 
> I was confused how you were geting better than 64 byte precision in your
> example.
>
Ah...got it. Thanks!

> > +		}
> > +
> > +		/* Protect against an uncleared _FLAG_MORE */
> > +		nr_records = nr_records + le16_to_cpu(po->count);
> > +		if (nr_records >= cxlds->poison_max)
> > +			goto out;
> > +
> > +	} while (po->flags & CXL_POISON_FLAG_MORE);
> So.. A conundrum here.  What happens if:
> 
> 1. We get an error mid way through a set of multiple reads
>    (something intermittent - maybe a software issue)
> 2. We will drop out of here fine and report the error.
> 3. We run this function again.
> 
> It will (I think) currently pick up where we left off, but we have
> no way of knowing that as there isn't a 'total records' count or
> any other form of index in the output payload.

Yes. That is sad. I'm assume it's by design and CXL devices never
intended to keep any totals.

> 
> So, software solutions I think should work (though may warrant a note
> to be added to the spec).
> 
> 1. Read whole thing twice. First time is just to ensure we get
>    to the end and flush out any prior half done reads.
> 2. Issue a read for a different region (perhaps length 0) first
>    and assume everything starts from scratch when we go back to
>    this region.

Can you tell me more about 2 ?

Also, Since posting this I have added protection to this path to ensure
only one reader of the poison list for this device. Like this:

if (!completion_done(&cxlds->read_poison_complete);
              return -EBUSY;
wait_for_completion_interruptible(&cxlds->read_poison_complete);
	...GET ALL THE POISON...
complete(&cxlds->read_poison_complete);

And will add the error message on that unexpected _FLAG_MORE too.

Alison
> 
> Jonathan
> 



> > +
> > +out:
> > +	kvfree(po);
> > +	return rc;
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
> > +
> >  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
> >  {
> >  	struct cxl_dev_state *cxlds;
>
Davidlohr Bueso June 17, 2022, 5:29 p.m. UTC | #17
On Fri, 17 Jun 2022, Alison Schofield wrote:

>On Fri, Jun 17, 2022 at 07:05:08AM -0700, Jonathan Cameron wrote:
>> On Tue, 14 Jun 2022 17:10:27 -0700
>> alison.schofield@intel.com wrote:
>>
>> > From: Alison Schofield <alison.schofield@intel.com>
>> >
>> > CXL devices that support persistent memory maintain a list of locations
>> > that are poisoned or result in poison if the addresses are accessed by
>> > the host.
>> >
>> > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
>> > list as a set of  Media Error Records that include the source of the
>> > error, the starting device physical address and length. The length is
>> > the number of adjacent DPAs in the record and is in units of 64 bytes.
>> >
>> > Retrieve the list and log each Media Error Record as a trace event of
>> > type cxl_poison_list.
>> >
>> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
>>
>> A few more things inline.
>>
>> Otherwise, can confirm it works with some hack QEMU code.
>> I'll tidy that up and post soon.
>>
>> > +int cxl_mem_get_poison_list(struct device *dev)
>> > +{
>snip
>> > +
>> > +			trace_cxl_poison_list(dev, source, addr, len);
>>
>> Need to mask off the lower 6 bits of addr as they contain the source
>> + a few reserved bits.
>>
>> I was confused how you were geting better than 64 byte precision in your
>> example.
>>
>Ah...got it. Thanks!
>
>> > +		}
>> > +
>> > +		/* Protect against an uncleared _FLAG_MORE */
>> > +		nr_records = nr_records + le16_to_cpu(po->count);
>> > +		if (nr_records >= cxlds->poison_max)
>> > +			goto out;
>> > +
>> > +	} while (po->flags & CXL_POISON_FLAG_MORE);
>> So.. A conundrum here.  What happens if:
>>
>> 1. We get an error mid way through a set of multiple reads
>>    (something intermittent - maybe a software issue)
>> 2. We will drop out of here fine and report the error.
>> 3. We run this function again.
>>
>> It will (I think) currently pick up where we left off, but we have
>> no way of knowing that as there isn't a 'total records' count or
>> any other form of index in the output payload.
>
>Yes. That is sad. I'm assume it's by design and CXL devices never
>intended to keep any totals.
>
>>
>> So, software solutions I think should work (though may warrant a note
>> to be added to the spec).
>>
>> 1. Read whole thing twice. First time is just to ensure we get
>>    to the end and flush out any prior half done reads.
>> 2. Issue a read for a different region (perhaps length 0) first
>>    and assume everything starts from scratch when we go back to
>>    this region.
>
>Can you tell me more about 2 ?
>
>Also, Since posting this I have added protection to this path to ensure
>only one reader of the poison list for this device. Like this:

I don't think we should prevent multiple list reads. I would expect the
scenario Jonathan describes to be the uncommon case.

Thanks,
Davidlohr

>
>if (!completion_done(&cxlds->read_poison_complete);
>              return -EBUSY;
>wait_for_completion_interruptible(&cxlds->read_poison_complete);
>	...GET ALL THE POISON...
>complete(&cxlds->read_poison_complete);
>
>And will add the error message on that unexpected _FLAG_MORE too.
>
>Alison
>>
>> Jonathan
>>
>
>
>
>> > +
>> > +out:
>> > +	kvfree(po);
>> > +	return rc;
>> > +}
>> > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
>> > +
>> >  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
>> >  {
>> >	struct cxl_dev_state *cxlds;
>>
Dan Williams June 17, 2022, 6:26 p.m. UTC | #18
alison.schofield@ wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices that support persistent memory maintain a list of locations
> that are poisoned or result in poison if the addresses are accessed by
> the host.
> 
> Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> list as a set of  Media Error Records that include the source of the
> error, the starting device physical address and length. The length is
> the number of adjacent DPAs in the record and is in units of 64 bytes.
> 
> Retrieve the list and log each Media Error Record as a trace event of
> type cxl_poison_list.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  drivers/cxl/cxlmem.h    | 43 +++++++++++++++++++++++
>  drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 118 insertions(+)
> 
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 60d10ee1e7fc..29cf0459b44a 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info {
>   *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
>   * @lsa_size: Size of Label Storage Area
>   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
> + * @poison_max_mer: maximum Media Error Records tracked in Poison List
>   * @mbox_mutex: Mutex to synchronize mailbox access.
>   * @firmware_version: Firmware version for the memory device.
>   * @enabled_cmds: Hardware commands found enabled in CEL.
> @@ -204,6 +205,7 @@ struct cxl_dev_state {
>  
>  	size_t payload_size;
>  	size_t lsa_size;
> +	u32 poison_max;
>  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>  	char firmware_version[0x10];
>  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
> @@ -317,6 +319,46 @@ struct cxl_mbox_set_partition_info {
>  
>  #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
>  
> +struct cxl_mbox_poison_payload_in {
> +	__le64 offset;
> +	__le64 length;
> +} __packed;
> +
> +struct cxl_mbox_poison_payload_out {
> +	u8 flags;
> +	u8 rsvd1;
> +	__le64 overflow_timestamp;
> +	__le16 count;
> +	u8 rsvd2[0x14];
> +	struct cxl_poison_record {
> +		__le64 address;
> +		__le32 length;
> +		__le32 rsvd;
> +	} __packed record[];
> +} __packed;
> +
> +/* CXL 8.2.9.5.4.1 Get Poison List: payload out flags: */
> +#define CXL_POISON_FLAG_MORE            BIT(0)
> +#define CXL_POISON_FLAG_OVERFLOW        BIT(1)
> +#define CXL_POISON_FLAG_SCANNING        BIT(2)
> +
> +/* CXL 8.2.9.5.4.1 Get Poison List: Error is encoded in record.address[2:0] */
> +#define CXL_POISON_SOURCE_MASK		GENMASK(2, 0)
> +#define	CXL_POISON_SOURCE_UNKNOWN	0
> +#define	CXL_POISON_SOURCE_EXTERNAL	1
> +#define	CXL_POISON_SOURCE_INTERNAL	2
> +#define	CXL_POISON_SOURCE_INJECTED	3
> +#define	CXL_POISON_SOURCE_VENDOR	7
> +
> +/* Software define */
> +#define	CXL_POISON_SOURCE_INVALID	99
> +#define CXL_POISON_SOURCE_VALID(x)		\
> +	(((x) == CXL_POISON_SOURCE_UNKNOWN)  ||	\
> +	 ((x) == CXL_POISON_SOURCE_EXTERNAL) ||	\
> +	 ((x) == CXL_POISON_SOURCE_INTERNAL) ||	\
> +	 ((x) == CXL_POISON_SOURCE_INJECTED) ||	\
> +	 ((x) == CXL_POISON_SOURCE_VENDOR))
> +
>  /**
>   * struct cxl_mem_command - Driver representation of a memory device command
>   * @info: Command information as it exists for the UAPI
> @@ -351,6 +393,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
>  void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
>  void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> +int cxl_mem_get_poison_list(struct device *dev);
>  #ifdef CONFIG_CXL_SUSPEND
>  void cxl_mem_active_inc(void);
>  void cxl_mem_active_dec(void);
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 54f434733b56..c10c7020ebc2 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -9,6 +9,9 @@
>  
>  #include "core.h"
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/cxl.h>
> +
>  static bool cxl_raw_allow_all;
>  
>  /**
> @@ -755,6 +758,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>  {
>  	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
>  	struct cxl_mbox_identify id;
> +	__le32 val = 0;
>  	int rc;
>  
>  	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
> @@ -783,6 +787,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>  	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
>  	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
>  
> +	memcpy(&val, id.poison_list_max_mer, 3);
> +	cxlds->poison_max = le32_to_cpu(val);
> +
>  	return 0;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
> @@ -826,6 +833,74 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
>  
> +int cxl_mem_get_poison_list(struct device *dev)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct cxl_mbox_poison_payload_out *po;
> +	struct cxl_mbox_poison_payload_in pi;
> +	int nr_records = 0;
> +	int rc, i;
> +
> +	if (range_len(&cxlds->pmem_range)) {
> +		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> +		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));
> +	} else {
> +		return -ENXIO;
> +	}
> +
> +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> +	if (!po)
> +		return -ENOMEM;
> +
> +	do {
> +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> +				       sizeof(pi), po, cxlds->payload_size);
> +		if (rc)
> +			goto out;

In this case, no need for a 'goto' when 'break' will do.

> +
> +		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
> +			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
> +
> +			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
> +				&o_time);

This should be just another event type, not an error message.

> +			rc = -ENXIO;

No need to error out, the media error records are still valid.

> +			goto out;
> +		}
> +
> +		if (po->flags & CXL_POISON_FLAG_SCANNING) {
> +			dev_err(dev, "Scan Media in Progress\n");

This isn't an error condition and it should be something the kernel is
mediating in the first instance. For example if userspace did:

echo 1 > trace_poison &
echo 1 > trace_poison_cached

I would expect the second echo to just block awaiting the completion of
the scan media request. I.e. just prevent the possibility of these
commands colliding outside of CONFIG_CXL_MEM_RAW_COMMANDS=y shenanigans,
in which case userspace gets to keep the pieces.

> +			rc = -EBUSY;
> +			goto out;
> +		}
> +
> +		for (i = 0; i < le16_to_cpu(po->count); i++) {
> +			u64 addr = le64_to_cpu(po->record[i].address);
> +			u32 len = le32_to_cpu(po->record[i].length);
> +			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
> +
> +			if (!CXL_POISON_SOURCE_VALID(source)) {
> +				dev_dbg(dev, "Invalid poison source %d",
> +					source);

Per above, just emit the raw field value and leave parsing values to
userspace.

> +				source = CXL_POISON_SOURCE_INVALID;
> +			}
> +
> +			trace_cxl_poison_list(dev, source, addr, len);
> +		}
> +
> +		/* Protect against an uncleared _FLAG_MORE */
> +		nr_records = nr_records + le16_to_cpu(po->count);
> +		if (nr_records >= cxlds->poison_max)
> +			goto out;

This also feels like something that should have a Linux max as well,
something like:

"cxlds->poison_max = min(identify->poison_max, CXL_LIST_POISON_MAX)"

...where CXL_LIST_POISON_MAX is a "reasonable" maximum for a cache of
hardware media poison errors like 1024.

> +
> +	} while (po->flags & CXL_POISON_FLAG_MORE);
> +
> +out:
> +	kvfree(po);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
> +
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
>  {
>  	struct cxl_dev_state *cxlds;
> -- 
> 2.31.1
>
Dan Williams June 17, 2022, 7:02 p.m. UTC | #19
Alison Schofield wrote:
> On Thu, Jun 16, 2022 at 12:43:34PM -0700, Davidlohr Bueso wrote:
> > On Tue, 14 Jun 2022, alison.schofield@intel.com wrote:
> > 
> > >From: Alison Schofield <alison.schofield@intel.com>
> > >
> > >CXL devices that support persistent memory maintain a list of locations
> > >that are poisoned or result in poison if the addresses are accessed by
> > >the host.
> > >
> > >Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> > >list as a set of  Media Error Records that include the source of the
> > >error, the starting device physical address and length. The length is
> > >the number of adjacent DPAs in the record and is in units of 64 bytes.
> > >
> > >Retrieve the list and log each Media Error Record as a trace event of
> > >type cxl_poison_list.
> > >
> > >Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > >---
> > > drivers/cxl/cxlmem.h    | 43 +++++++++++++++++++++++
> > > drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++
> > > 2 files changed, 118 insertions(+)
> > >
> snip
> 
> > >+int cxl_mem_get_poison_list(struct device *dev)
> > >+{
> > >+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > >+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > >+	struct cxl_mbox_poison_payload_out *po;
> > >+	struct cxl_mbox_poison_payload_in pi;
> > >+	int nr_records = 0;
> > >+	int rc, i;
> > >+
> > >+	if (range_len(&cxlds->pmem_range)) {
> > >+		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> > >+		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));
> 
> First off - you stopped at a bug here - that pi.length needs to be
> in units of 64 bytes.
> > 
> > Do you ever see this changing to not always use the full pmem DPA range
> > but allow arbitrary ones? I also assume this is the reason why you don't
> > check the range vs cxlds->ram_range to prevent any overlaps, no?
> > 
> > Thanks,
> > Davidlohr
> 
> David - Great question!
> 
> I'm headed in this direction -
> 
> cxl list --media-errors -m mem1
> 	lists media errors for requested memdev
> 
> cxl list --media-errors -r region#
> 	lists region errors with HPA addresses
> 	(So here cxl tool will collect the poison for all the regions
> 	 memdevs and do the DPA to HPA translation)
> 
> To answer your question, I wasn't thinking of limiting
> the range within the memdev, but certainly could. And if we were
> taking in ranges, those ranges would need to be checked.
> 
> $cxl list --media-errors -m mem1 --range-start=  --range-end|len=
> 
> Now, if I left the sysfs inteface as is, the driver will read the 
> entire poison list for the memdev and then cxl tool will filter it
> for the range requested. 
> 
> Or, maybe we should implement in libcxl (not sysfs), with memdev and
> range options and only collect from the device the range requested.
> 
> Either one looks the same to the cxl tool user, but limiting the
> range we send to the device would certainly cut down on unwanted
> records being logged, retrieved, and examined.
> 
> I'd like to hear more from you and other community members.

There is some history here that is relevant to this design. CXL Get
Poison List builds on lessons learned from the ACPI "Address Range
Scrub" mechanism that was deployed for ACPI described persistent memory
platform (See ACPI 6.4 9.20.7.2 "Address Range Scrubbing (ARS)
Overview"). In that case there was no expectation that the device
maintained a cached and coherent (with incoming poison writes) copy of
the media error list. CXL Get Poison List in comparison is meant to
obviate the need to perform Scan Media in most scenarios, and it is
lightweight enough that userspace need not have a mechanism to request
errors by range, in my opinion.

One of the design warts of drivers/acpi/nfit/ that I want to get away
from in the case of drivers/cxl/ is snooping the equivalent of ARS
command results to populate a kernel list of poison addresses and
instead put the onus on userspace to collect DPA events and optionally
inform the kernel of the HPA impacts. For example, DAX filesystems will
soon have the ability to do something useful with poison notifications
[1], but that support is limited to synchronously consumed poison
flagged by memory_failure(). When the cxl tool translates the poison
list to HPA it needs an ABI to turn around and notify the filesystem
about which blocks got clobbered.

[1]: https://lore.kernel.org/all/20220616193157.2c2e963f3e7e38dfac554a28@linux-foundation.org/
Dan Williams June 17, 2022, 7:27 p.m. UTC | #20
Jonathan Cameron wrote:
> On Tue, 14 Jun 2022 17:10:27 -0700
> alison.schofield@intel.com wrote:
> 
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices that support persistent memory maintain a list of locations
> > that are poisoned or result in poison if the addresses are accessed by
> > the host.
> > 
> > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> > list as a set of  Media Error Records that include the source of the
> > error, the starting device physical address and length. The length is
> > the number of adjacent DPAs in the record and is in units of 64 bytes.
> > 
> > Retrieve the list and log each Media Error Record as a trace event of
> > type cxl_poison_list.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> 
> A few more things inline.
> 
> Otherwise, can confirm it works with some hack QEMU code.
> I'll tidy that up and post soon.
> 
> > +int cxl_mem_get_poison_list(struct device *dev)
> > +{
> > +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +	struct cxl_mbox_poison_payload_out *po;
> > +	struct cxl_mbox_poison_payload_in pi;
> > +	int nr_records = 0;
> > +	int rc, i;
> > +
> > +	if (range_len(&cxlds->pmem_range)) {
> > +		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> > +		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));
> > +	} else {
> > +		return -ENXIO;
> > +	}
> > +
> > +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> > +	if (!po)
> > +		return -ENOMEM;
> > +
> > +	do {
> > +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> > +				       sizeof(pi), po, cxlds->payload_size);
> > +		if (rc)
> > +			goto out;
> > +
> > +		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
> > +			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
> > +
> > +			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
> > +				&o_time);
> > +			rc = -ENXIO;
> > +			goto out;
> > +		}
> > +
> > +		if (po->flags & CXL_POISON_FLAG_SCANNING) {
> > +			dev_err(dev, "Scan Media in Progress\n");
> > +			rc = -EBUSY;
> > +			goto out;
> > +		}
> > +
> > +		for (i = 0; i < le16_to_cpu(po->count); i++) {
> > +			u64 addr = le64_to_cpu(po->record[i].address);
> 
> > +			u32 len = le32_to_cpu(po->record[i].length);
> 
> 
> > +			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
> > +
> > +			if (!CXL_POISON_SOURCE_VALID(source)) {
> > +				dev_dbg(dev, "Invalid poison source %d",
> > +					source);
> > +				source = CXL_POISON_SOURCE_INVALID;
> > +			}
> > +
> > +			trace_cxl_poison_list(dev, source, addr, len);
> 
> Need to mask off the lower 6 bits of addr as they contain the source
> + a few reserved bits.
> 
> I was confused how you were geting better than 64 byte precision in your
> example.
> 
> > +		}
> > +
> > +		/* Protect against an uncleared _FLAG_MORE */
> > +		nr_records = nr_records + le16_to_cpu(po->count);
> > +		if (nr_records >= cxlds->poison_max)
> > +			goto out;
> > +
> > +	} while (po->flags & CXL_POISON_FLAG_MORE);
> So.. A conundrum here.  What happens if:
> 
> 1. We get an error mid way through a set of multiple reads
>    (something intermittent - maybe a software issue)
> 2. We will drop out of here fine and report the error.
> 3. We run this function again.
> 
> It will (I think) currently pick up where we left off, but we have
> no way of knowing that as there isn't a 'total records' count or
> any other form of index in the output payload.
> 
> So, software solutions I think should work (though may warrant a note
> to be added to the spec).
> 
> 1. Read whole thing twice. First time is just to ensure we get
>    to the end and flush out any prior half done reads.
> 2. Issue a read for a different region (perhaps length 0) first
>    and assume everything starts from scratch when we go back to
>    this region.

It would be nice if this was codified as *the* way to reset the
retrieval, but I worry that neither length==0 or length==1 can be used
for this purpose since the "more" bit is attached to the last passed in
*range*, not request. I.e. spec seems to allow for overlapping
retrievals, although I doubt any implementation gets that fancy.

I think it is sufficient to just include the "more" flag in the trace
event and if userspace suspects that it is getting "more" results from a
previous run it can reissue the scan. This is another reason that the
trace event should include the pid of the process that triggered the
results so it can delineate re-requests. Otherwise, the poison list
cache is opportunistic so I am not sure that missing records in this
corner case is fatal.
Dan Williams June 17, 2022, 7:32 p.m. UTC | #21
Alison Schofield wrote:
> On Fri, Jun 17, 2022 at 07:05:08AM -0700, Jonathan Cameron wrote:
> > On Tue, 14 Jun 2022 17:10:27 -0700
> > alison.schofield@intel.com wrote:
> > 
> > > From: Alison Schofield <alison.schofield@intel.com>
> > > 
> > > CXL devices that support persistent memory maintain a list of locations
> > > that are poisoned or result in poison if the addresses are accessed by
> > > the host.
> > > 
> > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> > > list as a set of  Media Error Records that include the source of the
> > > error, the starting device physical address and length. The length is
> > > the number of adjacent DPAs in the record and is in units of 64 bytes.
> > > 
> > > Retrieve the list and log each Media Error Record as a trace event of
> > > type cxl_poison_list.
> > > 
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > 
> > A few more things inline.
> > 
> > Otherwise, can confirm it works with some hack QEMU code.
> > I'll tidy that up and post soon.
> > 
> > > +int cxl_mem_get_poison_list(struct device *dev)
> > > +{
> snip
> > > +
> > > +			trace_cxl_poison_list(dev, source, addr, len);
> > 
> > Need to mask off the lower 6 bits of addr as they contain the source
> > + a few reserved bits.
> > 
> > I was confused how you were geting better than 64 byte precision in your
> > example.
> >
> Ah...got it. Thanks!
> 
> > > +		}
> > > +
> > > +		/* Protect against an uncleared _FLAG_MORE */
> > > +		nr_records = nr_records + le16_to_cpu(po->count);
> > > +		if (nr_records >= cxlds->poison_max)
> > > +			goto out;
> > > +
> > > +	} while (po->flags & CXL_POISON_FLAG_MORE);
> > So.. A conundrum here.  What happens if:
> > 
> > 1. We get an error mid way through a set of multiple reads
> >    (something intermittent - maybe a software issue)
> > 2. We will drop out of here fine and report the error.
> > 3. We run this function again.
> > 
> > It will (I think) currently pick up where we left off, but we have
> > no way of knowing that as there isn't a 'total records' count or
> > any other form of index in the output payload.
> 
> Yes. That is sad. I'm assume it's by design and CXL devices never
> intended to keep any totals.
> 
> > 
> > So, software solutions I think should work (though may warrant a note
> > to be added to the spec).
> > 
> > 1. Read whole thing twice. First time is just to ensure we get
> >    to the end and flush out any prior half done reads.
> > 2. Issue a read for a different region (perhaps length 0) first
> >    and assume everything starts from scratch when we go back to
> >    this region.
> 
> Can you tell me more about 2 ?
> 
> Also, Since posting this I have added protection to this path to ensure
> only one reader of the poison list for this device. Like this:
> 
> if (!completion_done(&cxlds->read_poison_complete);
>               return -EBUSY;
> wait_for_completion_interruptible(&cxlds->read_poison_complete);
> 	...GET ALL THE POISON...
> complete(&cxlds->read_poison_complete);

Since this runs in the context of the requester a completion feels out
of place. What this probably wants is a mutex() protecting the state
machine of the Media Error Record retrieval and the "more" flag.

> 
> And will add the error message on that unexpected _FLAG_MORE too.
> 
> Alison
> > 
> > Jonathan
> > 
> 
> 
> 
> > > +
> > > +out:
> > > +	kvfree(po);
> > > +	return rc;
> > > +}
> > > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
> > > +
> > >  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
> > >  {
> > >  	struct cxl_dev_state *cxlds;
> >
Jonathan Cameron June 20, 2022, 10:53 a.m. UTC | #22
On Fri, 17 Jun 2022 12:02:32 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Alison Schofield wrote:
> > On Thu, Jun 16, 2022 at 12:43:34PM -0700, Davidlohr Bueso wrote:  
> > > On Tue, 14 Jun 2022, alison.schofield@intel.com wrote:
> > >   
> > > >From: Alison Schofield <alison.schofield@intel.com>
> > > >
> > > >CXL devices that support persistent memory maintain a list of locations
> > > >that are poisoned or result in poison if the addresses are accessed by
> > > >the host.
> > > >
> > > >Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> > > >list as a set of  Media Error Records that include the source of the
> > > >error, the starting device physical address and length. The length is
> > > >the number of adjacent DPAs in the record and is in units of 64 bytes.
> > > >
> > > >Retrieve the list and log each Media Error Record as a trace event of
> > > >type cxl_poison_list.
> > > >
> > > >Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > >---
> > > > drivers/cxl/cxlmem.h    | 43 +++++++++++++++++++++++
> > > > drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++
> > > > 2 files changed, 118 insertions(+)
> > > >  
> > snip
> >   
> > > >+int cxl_mem_get_poison_list(struct device *dev)
> > > >+{
> > > >+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > > >+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > > >+	struct cxl_mbox_poison_payload_out *po;
> > > >+	struct cxl_mbox_poison_payload_in pi;
> > > >+	int nr_records = 0;
> > > >+	int rc, i;
> > > >+
> > > >+	if (range_len(&cxlds->pmem_range)) {
> > > >+		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> > > >+		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));  
> > 
> > First off - you stopped at a bug here - that pi.length needs to be
> > in units of 64 bytes.  
> > > 
> > > Do you ever see this changing to not always use the full pmem DPA range
> > > but allow arbitrary ones? I also assume this is the reason why you don't
> > > check the range vs cxlds->ram_range to prevent any overlaps, no?
> > > 
> > > Thanks,
> > > Davidlohr  
> > 
> > David - Great question!
> > 
> > I'm headed in this direction -
> > 
> > cxl list --media-errors -m mem1
> > 	lists media errors for requested memdev
> > 
> > cxl list --media-errors -r region#
> > 	lists region errors with HPA addresses
> > 	(So here cxl tool will collect the poison for all the regions
> > 	 memdevs and do the DPA to HPA translation)
> > 
> > To answer your question, I wasn't thinking of limiting
> > the range within the memdev, but certainly could. And if we were
> > taking in ranges, those ranges would need to be checked.
> > 
> > $cxl list --media-errors -m mem1 --range-start=  --range-end|len=
> > 
> > Now, if I left the sysfs inteface as is, the driver will read the 
> > entire poison list for the memdev and then cxl tool will filter it
> > for the range requested. 
> > 
> > Or, maybe we should implement in libcxl (not sysfs), with memdev and
> > range options and only collect from the device the range requested.
> > 
> > Either one looks the same to the cxl tool user, but limiting the
> > range we send to the device would certainly cut down on unwanted
> > records being logged, retrieved, and examined.
> > 
> > I'd like to hear more from you and other community members.  
> 
> There is some history here that is relevant to this design. CXL Get
> Poison List builds on lessons learned from the ACPI "Address Range
> Scrub" mechanism that was deployed for ACPI described persistent memory
> platform (See ACPI 6.4 9.20.7.2 "Address Range Scrubbing (ARS)
> Overview"). In that case there was no expectation that the device
> maintained a cached and coherent (with incoming poison writes) copy of
> the media error list. CXL Get Poison List in comparison is meant to
> obviate the need to perform Scan Media in most scenarios, and it is
> lightweight enough that userspace need not have a mechanism to request
> errors by range, in my opinion.
> 
> One of the design warts of drivers/acpi/nfit/ that I want to get away
> from in the case of drivers/cxl/ is snooping the equivalent of ARS
> command results to populate a kernel list of poison addresses and
> instead put the onus on userspace to collect DPA events and optionally
> inform the kernel of the HPA impacts.

Can you give more info on why such an in kernel flow doesn't work
well for this usecase (particularly for volatile memory)? I don't
much like the flow differing from how we do DRAM patrol scrub based
handling today. I'm not sure I like the requirement for userspace
to be in the loop if we are dealing with volatile failures even if
the detection is async.

> For example, DAX filesystems will
> soon have the ability to do something useful with poison notifications
> [1], but that support is limited to synchronously consumed poison
> flagged by memory_failure(). When the cxl tool translates the poison
> list to HPA it needs an ABI to turn around and notify the filesystem
> about which blocks got clobbered.

I'm not clear why it makes sense to do the DPA to HPA conversion in
userspace. It should be a fairly trivial bit of code to do the reverse look
up in kernel and then we just bolt into existing infrastructure.

Jonathan

> 
> [1]: https://lore.kernel.org/all/20220616193157.2c2e963f3e7e38dfac554a28@linux-foundation.org/
Jonathan Cameron June 20, 2022, 10:56 a.m. UTC | #23
On Fri, 17 Jun 2022 09:29:35 -0700
Alison Schofield <alison.schofield@intel.com> wrote:

> On Fri, Jun 17, 2022 at 07:05:08AM -0700, Jonathan Cameron wrote:
> > On Tue, 14 Jun 2022 17:10:27 -0700
> > alison.schofield@intel.com wrote:
> >   
> > > From: Alison Schofield <alison.schofield@intel.com>
> > > 
> > > CXL devices that support persistent memory maintain a list of locations
> > > that are poisoned or result in poison if the addresses are accessed by
> > > the host.
> > > 
> > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> > > list as a set of  Media Error Records that include the source of the
> > > error, the starting device physical address and length. The length is
> > > the number of adjacent DPAs in the record and is in units of 64 bytes.
> > > 
> > > Retrieve the list and log each Media Error Record as a trace event of
> > > type cxl_poison_list.
> > > 
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>  
> > 
> > A few more things inline.
> > 
> > Otherwise, can confirm it works with some hack QEMU code.
> > I'll tidy that up and post soon.
> >   
> > > +int cxl_mem_get_poison_list(struct device *dev)
> > > +{  
> snip
> > > +
> > > +			trace_cxl_poison_list(dev, source, addr, len);  
> > 
> > Need to mask off the lower 6 bits of addr as they contain the source
> > + a few reserved bits.
> > 
> > I was confused how you were geting better than 64 byte precision in your
> > example.
> >  
> Ah...got it. Thanks!
> 
> > > +		}
> > > +
> > > +		/* Protect against an uncleared _FLAG_MORE */
> > > +		nr_records = nr_records + le16_to_cpu(po->count);
> > > +		if (nr_records >= cxlds->poison_max)
> > > +			goto out;
> > > +
> > > +	} while (po->flags & CXL_POISON_FLAG_MORE);  
> > So.. A conundrum here.  What happens if:
> > 
> > 1. We get an error mid way through a set of multiple reads
> >    (something intermittent - maybe a software issue)
> > 2. We will drop out of here fine and report the error.
> > 3. We run this function again.
> > 
> > It will (I think) currently pick up where we left off, but we have
> > no way of knowing that as there isn't a 'total records' count or
> > any other form of index in the output payload.  
> 
> Yes. That is sad. I'm assume it's by design and CXL devices never
> intended to keep any totals.
> 
> > 
> > So, software solutions I think should work (though may warrant a note
> > to be added to the spec).
> > 
> > 1. Read whole thing twice. First time is just to ensure we get
> >    to the end and flush out any prior half done reads.
> > 2. Issue a read for a different region (perhaps length 0) first
> >    and assume everything starts from scratch when we go back to
> >    this region.  
> 
> Can you tell me more about 2 ?

2 relies on interpreting what the spec says for an unusual corner case.
The concept of 'more available' is something I would assume you'd only
get if you issued a repeat of the same request.  I don't think the
spec actually covers this case - perhaps that's something we need to
raise via the appropriate channels.

Jonathan



> 
> Also, Since posting this I have added protection to this path to ensure
> only one reader of the poison list for this device. Like this:
> 
> if (!completion_done(&cxlds->read_poison_complete);
>               return -EBUSY;
> wait_for_completion_interruptible(&cxlds->read_poison_complete);
> 	...GET ALL THE POISON...
> complete(&cxlds->read_poison_complete);
> 
> And will add the error message on that unexpected _FLAG_MORE too.
> 
> Alison
> > 
> > Jonathan
> >   
> 
> 
> 
> > > +
> > > +out:
> > > +	kvfree(po);
> > > +	return rc;
> > > +}
> > > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
> > > +
> > >  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
> > >  {
> > >  	struct cxl_dev_state *cxlds;  
> >
Jonathan Cameron June 20, 2022, 11:30 a.m. UTC | #24
On Fri, 17 Jun 2022 12:27:38 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Jonathan Cameron wrote:
> > On Tue, 14 Jun 2022 17:10:27 -0700
> > alison.schofield@intel.com wrote:
> >   
> > > From: Alison Schofield <alison.schofield@intel.com>
> > > 
> > > CXL devices that support persistent memory maintain a list of locations
> > > that are poisoned or result in poison if the addresses are accessed by
> > > the host.
> > > 
> > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
> > > list as a set of  Media Error Records that include the source of the
> > > error, the starting device physical address and length. The length is
> > > the number of adjacent DPAs in the record and is in units of 64 bytes.
> > > 
> > > Retrieve the list and log each Media Error Record as a trace event of
> > > type cxl_poison_list.
> > > 
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>  
> > 
> > A few more things inline.
> > 
> > Otherwise, can confirm it works with some hack QEMU code.
> > I'll tidy that up and post soon.
> >   
> > > +int cxl_mem_get_poison_list(struct device *dev)
> > > +{
> > > +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > > +	struct cxl_mbox_poison_payload_out *po;
> > > +	struct cxl_mbox_poison_payload_in pi;
> > > +	int nr_records = 0;
> > > +	int rc, i;
> > > +
> > > +	if (range_len(&cxlds->pmem_range)) {
> > > +		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
> > > +		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));
> > > +	} else {
> > > +		return -ENXIO;
> > > +	}
> > > +
> > > +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> > > +	if (!po)
> > > +		return -ENOMEM;
> > > +
> > > +	do {
> > > +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> > > +				       sizeof(pi), po, cxlds->payload_size);
> > > +		if (rc)
> > > +			goto out;
> > > +
> > > +		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
> > > +			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
> > > +
> > > +			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
> > > +				&o_time);
> > > +			rc = -ENXIO;
> > > +			goto out;
> > > +		}
> > > +
> > > +		if (po->flags & CXL_POISON_FLAG_SCANNING) {
> > > +			dev_err(dev, "Scan Media in Progress\n");
> > > +			rc = -EBUSY;
> > > +			goto out;
> > > +		}
> > > +
> > > +		for (i = 0; i < le16_to_cpu(po->count); i++) {
> > > +			u64 addr = le64_to_cpu(po->record[i].address);  
> >   
> > > +			u32 len = le32_to_cpu(po->record[i].length);  
> > 
> >   
> > > +			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
> > > +
> > > +			if (!CXL_POISON_SOURCE_VALID(source)) {
> > > +				dev_dbg(dev, "Invalid poison source %d",
> > > +					source);
> > > +				source = CXL_POISON_SOURCE_INVALID;
> > > +			}
> > > +
> > > +			trace_cxl_poison_list(dev, source, addr, len);  
> > 
> > Need to mask off the lower 6 bits of addr as they contain the source
> > + a few reserved bits.
> > 
> > I was confused how you were geting better than 64 byte precision in your
> > example.
> >   
> > > +		}
> > > +
> > > +		/* Protect against an uncleared _FLAG_MORE */
> > > +		nr_records = nr_records + le16_to_cpu(po->count);
> > > +		if (nr_records >= cxlds->poison_max)
> > > +			goto out;
> > > +
> > > +	} while (po->flags & CXL_POISON_FLAG_MORE);  
> > So.. A conundrum here.  What happens if:
> > 
> > 1. We get an error mid way through a set of multiple reads
> >    (something intermittent - maybe a software issue)
> > 2. We will drop out of here fine and report the error.
> > 3. We run this function again.
> > 
> > It will (I think) currently pick up where we left off, but we have
> > no way of knowing that as there isn't a 'total records' count or
> > any other form of index in the output payload.
> > 
> > So, software solutions I think should work (though may warrant a note
> > to be added to the spec).
> > 
> > 1. Read whole thing twice. First time is just to ensure we get
> >    to the end and flush out any prior half done reads.
> > 2. Issue a read for a different region (perhaps length 0) first
> >    and assume everything starts from scratch when we go back to
> >    this region.  
> 
> It would be nice if this was codified as *the* way to reset the
> retrieval, but I worry that neither length==0 or length==1 can be used
> for this purpose since the "more" bit is attached to the last passed in
> *range*, not request. I.e. spec seems to allow for overlapping
> retrievals, although I doubt any implementation gets that fancy.
> 
> I think it is sufficient to just include the "more" flag in the trace
> event and if userspace suspects that it is getting "more" results from a
> previous run it can reissue the scan. 

Meaning is a bit ugly if attached to an individual trace event, though
I guess we could do something nicer like only have one that doesn't have
MORE set, thus indicating that one trace event is the last one from a
query.  i.e. fill in MORE for all the other events in the last GET_POISON_LIST
reply. 

> This is another reason that the
> trace event should include the pid of the process that triggered the
> results so it can delineate re-requests. Otherwise, the poison list
> cache is opportunistic so I am not sure that missing records in this
> corner case is fatal.

Ok, for now let's document the limitation with an appropriate comment.
In parallel I've started a thread in appropriate venue to discuss if
we can clarify the spec and potentially do better in future.  So that
discussion should shift over there.

Thanks,

Jonathan
diff mbox series

Patch

diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 60d10ee1e7fc..29cf0459b44a 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -174,6 +174,7 @@  struct cxl_endpoint_dvsec_info {
  *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
  * @lsa_size: Size of Label Storage Area
  *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
+ * @poison_max_mer: maximum Media Error Records tracked in Poison List
  * @mbox_mutex: Mutex to synchronize mailbox access.
  * @firmware_version: Firmware version for the memory device.
  * @enabled_cmds: Hardware commands found enabled in CEL.
@@ -204,6 +205,7 @@  struct cxl_dev_state {
 
 	size_t payload_size;
 	size_t lsa_size;
+	u32 poison_max;
 	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
 	char firmware_version[0x10];
 	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
@@ -317,6 +319,46 @@  struct cxl_mbox_set_partition_info {
 
 #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
 
+struct cxl_mbox_poison_payload_in {
+	__le64 offset;
+	__le64 length;
+} __packed;
+
+struct cxl_mbox_poison_payload_out {
+	u8 flags;
+	u8 rsvd1;
+	__le64 overflow_timestamp;
+	__le16 count;
+	u8 rsvd2[0x14];
+	struct cxl_poison_record {
+		__le64 address;
+		__le32 length;
+		__le32 rsvd;
+	} __packed record[];
+} __packed;
+
+/* CXL 8.2.9.5.4.1 Get Poison List: payload out flags: */
+#define CXL_POISON_FLAG_MORE            BIT(0)
+#define CXL_POISON_FLAG_OVERFLOW        BIT(1)
+#define CXL_POISON_FLAG_SCANNING        BIT(2)
+
+/* CXL 8.2.9.5.4.1 Get Poison List: Error is encoded in record.address[2:0] */
+#define CXL_POISON_SOURCE_MASK		GENMASK(2, 0)
+#define	CXL_POISON_SOURCE_UNKNOWN	0
+#define	CXL_POISON_SOURCE_EXTERNAL	1
+#define	CXL_POISON_SOURCE_INTERNAL	2
+#define	CXL_POISON_SOURCE_INJECTED	3
+#define	CXL_POISON_SOURCE_VENDOR	7
+
+/* Software define */
+#define	CXL_POISON_SOURCE_INVALID	99
+#define CXL_POISON_SOURCE_VALID(x)		\
+	(((x) == CXL_POISON_SOURCE_UNKNOWN)  ||	\
+	 ((x) == CXL_POISON_SOURCE_EXTERNAL) ||	\
+	 ((x) == CXL_POISON_SOURCE_INTERNAL) ||	\
+	 ((x) == CXL_POISON_SOURCE_INJECTED) ||	\
+	 ((x) == CXL_POISON_SOURCE_VENDOR))
+
 /**
  * struct cxl_mem_command - Driver representation of a memory device command
  * @info: Command information as it exists for the UAPI
@@ -351,6 +393,7 @@  int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
 struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
 void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
 void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
+int cxl_mem_get_poison_list(struct device *dev);
 #ifdef CONFIG_CXL_SUSPEND
 void cxl_mem_active_inc(void);
 void cxl_mem_active_dec(void);
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 54f434733b56..c10c7020ebc2 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -9,6 +9,9 @@ 
 
 #include "core.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/cxl.h>
+
 static bool cxl_raw_allow_all;
 
 /**
@@ -755,6 +758,7 @@  int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
 {
 	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
 	struct cxl_mbox_identify id;
+	__le32 val = 0;
 	int rc;
 
 	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
@@ -783,6 +787,9 @@  int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
 	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
 	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
 
+	memcpy(&val, id.poison_list_max_mer, 3);
+	cxlds->poison_max = le32_to_cpu(val);
+
 	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
@@ -826,6 +833,74 @@  int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
 
+int cxl_mem_get_poison_list(struct device *dev)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct cxl_mbox_poison_payload_out *po;
+	struct cxl_mbox_poison_payload_in pi;
+	int nr_records = 0;
+	int rc, i;
+
+	if (range_len(&cxlds->pmem_range)) {
+		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
+		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));
+	} else {
+		return -ENXIO;
+	}
+
+	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
+	if (!po)
+		return -ENOMEM;
+
+	do {
+		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
+				       sizeof(pi), po, cxlds->payload_size);
+		if (rc)
+			goto out;
+
+		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
+			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
+
+			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
+				&o_time);
+			rc = -ENXIO;
+			goto out;
+		}
+
+		if (po->flags & CXL_POISON_FLAG_SCANNING) {
+			dev_err(dev, "Scan Media in Progress\n");
+			rc = -EBUSY;
+			goto out;
+		}
+
+		for (i = 0; i < le16_to_cpu(po->count); i++) {
+			u64 addr = le64_to_cpu(po->record[i].address);
+			u32 len = le32_to_cpu(po->record[i].length);
+			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
+
+			if (!CXL_POISON_SOURCE_VALID(source)) {
+				dev_dbg(dev, "Invalid poison source %d",
+					source);
+				source = CXL_POISON_SOURCE_INVALID;
+			}
+
+			trace_cxl_poison_list(dev, source, addr, len);
+		}
+
+		/* Protect against an uncleared _FLAG_MORE */
+		nr_records = nr_records + le16_to_cpu(po->count);
+		if (nr_records >= cxlds->poison_max)
+			goto out;
+
+	} while (po->flags & CXL_POISON_FLAG_MORE);
+
+out:
+	kvfree(po);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
+
 struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
 {
 	struct cxl_dev_state *cxlds;