diff mbox series

[1/5] cxl/memdev: Add support for the Inject Poison mailbox command

Message ID 3c260749c833f51d5cad9ae3912debcdf8b82753.1669781852.git.alison.schofield@intel.com
State Superseded
Headers show
Series cxl: CXL Inject & Clear Poison | expand

Commit Message

Alison Schofield Nov. 30, 2022, 4:34 a.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

CXL devices optionally support the INJECT POISON mailbox command. Add
a sysfs attribute and memdev driver support for injecting poison.

When a Device Physical Address (DPA) is written to the inject_poison
sysfs attribute send an inject poison command to the device for the
specified address.

Per the CXL Specification (8.2.9.8.4.2), after receiving a valid
inject poison request, the device will return poison when the address
is accessed through the CXL.mem bus. Injecting poison adds the address
to the device's Poison List and the error source is set to injected
error. In addition, the device adds a poison creation event to its
internal Informational Event log, updates the Event Status register,
and if configured, interrupts the host.

Also, per the CXL Specification, it is not an error to inject poison
into an address that already has poison present and no error is returned
from the device. The memdev driver performs basic sanity checking on the
address, however, it does not go as far as reading the poison list to see
if the address is on the list. That discovery is left to the device.

The inject_poison attribute is only visible for devices supporting
the capability.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
 Documentation/ABI/testing/sysfs-bus-cxl | 19 +++++++++
 drivers/cxl/core/memdev.c               | 53 +++++++++++++++++++++++++
 drivers/cxl/cxlmem.h                    |  3 ++
 3 files changed, 75 insertions(+)

Comments

Jonathan Cameron Nov. 30, 2022, 2:31 p.m. UTC | #1
On Tue, 29 Nov 2022 20:34:33 -0800
alison.schofield@intel.com wrote:

> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices optionally support the INJECT POISON mailbox command. Add
> a sysfs attribute and memdev driver support for injecting poison.
> 
> When a Device Physical Address (DPA) is written to the inject_poison
> sysfs attribute send an inject poison command to the device for the
> specified address.
> 
> Per the CXL Specification (8.2.9.8.4.2), after receiving a valid
> inject poison request, the device will return poison when the address
> is accessed through the CXL.mem bus. Injecting poison adds the address
> to the device's Poison List and the error source is set to injected
> error. In addition, the device adds a poison creation event to its
> internal Informational Event log, updates the Event Status register,
> and if configured, interrupts the host.
> 
> Also, per the CXL Specification, it is not an error to inject poison
> into an address that already has poison present and no error is returned
> from the device. The memdev driver performs basic sanity checking on the
> address, however, it does not go as far as reading the poison list to see
> if the address is on the list. That discovery is left to the device.
> 
> The inject_poison attribute is only visible for devices supporting
> the capability.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
A few trivial things inline.  With those fixes LGTM

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

> ---
>  Documentation/ABI/testing/sysfs-bus-cxl | 19 +++++++++
>  drivers/cxl/core/memdev.c               | 53 +++++++++++++++++++++++++
>  drivers/cxl/cxlmem.h                    |  3 ++
>  3 files changed, 75 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> index b715a4609718..20db97f7a1aa 100644
> --- a/Documentation/ABI/testing/sysfs-bus-cxl
> +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> @@ -416,3 +416,22 @@ Description:
>  		if accessed, and the source of the poison. The retrieved
>  		errors are logged as kernel trace events with the label
>  		'cxl_poison'.
> +
> +
> +What:		/sys/bus/cxl/devices/memX/inject_poison
> +Date:		December, 2022
> +KernelVersion:	v6.2
> +Contact:	linux-cxl@vger.kernel.org
> +Description:
> +		(WO) When a Device Physical Address (DPA) is written to this
> +		attribute the memdev driver sends an inject poison command to
> +		the device for the specified address. If successful, the device
> +		returns poison when the address is accessed through the CXL.mem
> +		bus. Injecting poison adds the address to the device's Poison
> +		List and the error source is set to injected error. In addition,
"set to Injected."

perhaps to match spec naming in Media Error Record.

> +		the device adds a poison creation event to its internal
> +		Informational Event log, updates the Event Status register, and
> +		if configured, interrupts the host. It is not an error to inject
> +		poison into an address that already has poison present and no
> +		error is returned. The inject_poison attribute is only visible
> +                for devices supporting the capability.

White space issues (spaces instead of tabs?)

Add something about the masked bits / granularity of addresses that are accepted.

> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index d08b7295a01c..71130813030f 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -142,6 +142,51 @@ static ssize_t trigger_poison_list_store(struct device *dev,
>  }
>  static DEVICE_ATTR_WO(trigger_poison_list);
>  
> +static int cxl_validate_poison_dpa(struct cxl_dev_state *cxlds, u64 dpa)
> +{
> +	if (!resource_size(&cxlds->dpa_res)) {
> +		dev_dbg(cxlds->dev, "device has no dpa resource\n");
> +		return -EINVAL;
> +	}
> +	if (dpa < cxlds->dpa_res.start || dpa > cxlds->dpa_res.end) {
> +		dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
> +			dpa, &cxlds->dpa_res);
> +		return -EINVAL;
> +	}
> +	if ((dpa & CXL_POISON_INJECT_RESERVED) != 0) {
> +		dev_dbg(cxlds->dev, "dpa reserve bit(s) [5:0] set 0x%llx\n",
> +			dpa);
> +		return -EINVAL;
> +	}
> +	return 0;
> +}
> +
> +static ssize_t inject_poison_store(struct device *dev,
> +				   struct device_attribute *attr,
> +				   const char *buf, size_t len)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	u64 dpa;
> +	int rc;
> +
> +	rc = kstrtou64(buf, 0, &dpa);
> +	if (rc)
> +		return rc;
> +	rc = cxl_validate_poison_dpa(cxlds, dpa);
> +	if (rc)
> +		return rc;
> +
> +	dpa = cpu_to_le64(dpa);
> +	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_INJECT_POISON, &dpa,

Endianness?

> +			       sizeof(dpa), NULL, cxlds->payload_size);
> +	if (rc)
> +		return rc;
> +
> +	return len;
> +}
> +static DEVICE_ATTR_WO(inject_poison);
> +
>  static struct attribute *cxl_memdev_attributes[] = {
>  	&dev_attr_serial.attr,
>  	&dev_attr_firmware_version.attr,
> @@ -149,6 +194,7 @@ static struct attribute *cxl_memdev_attributes[] = {
>  	&dev_attr_label_storage_size.attr,
>  	&dev_attr_numa_node.attr,
>  	&dev_attr_trigger_poison_list.attr,
> +	&dev_attr_inject_poison.attr,
>  	NULL,
>  };
>  
> @@ -175,6 +221,13 @@ static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
>  			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
>  			return 0;
>  	}
> +	if (a == &dev_attr_inject_poison.attr) {
> +		struct device *dev = kobj_to_dev(kobj);
> +
> +		if (!test_bit(CXL_MEM_COMMAND_ID_INJECT_POISON,
> +			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
> +			return 0;
> +	}
>  	return a->mode;
>  }
>  
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 19a9e545ac19..0d4c34be7335 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -396,6 +396,9 @@ struct cxl_mbox_poison_payload_out {
>  #define CXL_POISON_SOURCE_INJECTED	3
>  #define CXL_POISON_SOURCE_VENDOR	7
>  
> +/* Inject & Clear Poison  CXL 3.0 Spec 8.2.9.8.4.2/3 */
> +#define CXL_POISON_INJECT_RESERVED	GENMASK_ULL(5, 0)
> +
>  /**
>   * struct cxl_mem_command - Driver representation of a memory device command
>   * @info: Command information as it exists for the UAPI
Jonathan Cameron Nov. 30, 2022, 2:40 p.m. UTC | #2
On Wed, 30 Nov 2022 14:31:36 +0000
Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:

> On Tue, 29 Nov 2022 20:34:33 -0800
> alison.schofield@intel.com wrote:
> 
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices optionally support the INJECT POISON mailbox command. Add
> > a sysfs attribute and memdev driver support for injecting poison.
> > 
> > When a Device Physical Address (DPA) is written to the inject_poison
> > sysfs attribute send an inject poison command to the device for the
> > specified address.
> > 
> > Per the CXL Specification (8.2.9.8.4.2), after receiving a valid
> > inject poison request, the device will return poison when the address
> > is accessed through the CXL.mem bus. Injecting poison adds the address
> > to the device's Poison List and the error source is set to injected
> > error. In addition, the device adds a poison creation event to its
> > internal Informational Event log, updates the Event Status register,
> > and if configured, interrupts the host.
> > 
> > Also, per the CXL Specification, it is not an error to inject poison
> > into an address that already has poison present and no error is returned
> > from the device. The memdev driver performs basic sanity checking on the
> > address, however, it does not go as far as reading the poison list to see
> > if the address is on the list. That discovery is left to the device.
> > 
> > The inject_poison attribute is only visible for devices supporting
> > the capability.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>  
> A few trivial things inline.  With those fixes LGTM
> 
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> > ---
> >  Documentation/ABI/testing/sysfs-bus-cxl | 19 +++++++++
> >  drivers/cxl/core/memdev.c               | 53 +++++++++++++++++++++++++
> >  drivers/cxl/cxlmem.h                    |  3 ++
> >  3 files changed, 75 insertions(+)
> > 
> > diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> > index b715a4609718..20db97f7a1aa 100644
> > --- a/Documentation/ABI/testing/sysfs-bus-cxl
> > +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> > @@ -416,3 +416,22 @@ Description:
> >  		if accessed, and the source of the poison. The retrieved
> >  		errors are logged as kernel trace events with the label
> >  		'cxl_poison'.
> > +
> > +
> > +What:		/sys/bus/cxl/devices/memX/inject_poison
> > +Date:		December, 2022
> > +KernelVersion:	v6.2
> > +Contact:	linux-cxl@vger.kernel.org
> > +Description:
> > +		(WO) When a Device Physical Address (DPA) is written to this
> > +		attribute the memdev driver sends an inject poison command to
> > +		the device for the specified address. If successful, the device
> > +		returns poison when the address is accessed through the CXL.mem
> > +		bus. Injecting poison adds the address to the device's Poison
> > +		List and the error source is set to injected error. In addition,  
> "set to Injected."
> 
> perhaps to match spec naming in Media Error Record.
> 
> > +		the device adds a poison creation event to its internal
> > +		Informational Event log, updates the Event Status register, and
> > +		if configured, interrupts the host. It is not an error to inject
> > +		poison into an address that already has poison present and no
> > +		error is returned. The inject_poison attribute is only visible
> > +                for devices supporting the capability.  
> 
> White space issues (spaces instead of tabs?)
> 
> Add something about the masked bits / granularity of addresses that are accepted.
> 
> > diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> > index d08b7295a01c..71130813030f 100644
> > --- a/drivers/cxl/core/memdev.c
> > +++ b/drivers/cxl/core/memdev.c
> > @@ -142,6 +142,51 @@ static ssize_t trigger_poison_list_store(struct device *dev,
> >  }
> >  static DEVICE_ATTR_WO(trigger_poison_list);
> >  
> > +static int cxl_validate_poison_dpa(struct cxl_dev_state *cxlds, u64 dpa)
> > +{
> > +	if (!resource_size(&cxlds->dpa_res)) {
> > +		dev_dbg(cxlds->dev, "device has no dpa resource\n");
> > +		return -EINVAL;
> > +	}
> > +	if (dpa < cxlds->dpa_res.start || dpa > cxlds->dpa_res.end) {
> > +		dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
> > +			dpa, &cxlds->dpa_res);
> > +		return -EINVAL;
> > +	}
> > +	if ((dpa & CXL_POISON_INJECT_RESERVED) != 0) {
> > +		dev_dbg(cxlds->dev, "dpa reserve bit(s) [5:0] set 0x%llx\n",
> > +			dpa);
> > +		return -EINVAL;
> > +	}
> > +	return 0;
> > +}
> > +
> > +static ssize_t inject_poison_store(struct device *dev,
> > +				   struct device_attribute *attr,
> > +				   const char *buf, size_t len)
> > +{
> > +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +	u64 dpa;
> > +	int rc;
> > +
> > +	rc = kstrtou64(buf, 0, &dpa);
> > +	if (rc)
> > +		return rc;
> > +	rc = cxl_validate_poison_dpa(cxlds, dpa);
> > +	if (rc)
> > +		return rc;
> > +
> > +	dpa = cpu_to_le64(dpa);
> > +	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_INJECT_POISON, &dpa,  
> 
> Endianness?

Got thrown by the type and missed the cpu_to_le64().  Use a local __le64 so it's all explicit.
One of the static analysis tools will correctly moan about storing it to a u64
(can't remember which).

> 
> > +			       sizeof(dpa), NULL, cxlds->payload_size);
> > +	if (rc)
> > +		return rc;
> > +
> > +	return len;
> > +}
> > +static DEVICE_ATTR_WO(inject_poison);
> > +
> >  static struct attribute *cxl_memdev_attributes[] = {
> >  	&dev_attr_serial.attr,
> >  	&dev_attr_firmware_version.attr,
> > @@ -149,6 +194,7 @@ static struct attribute *cxl_memdev_attributes[] = {
> >  	&dev_attr_label_storage_size.attr,
> >  	&dev_attr_numa_node.attr,
> >  	&dev_attr_trigger_poison_list.attr,
> > +	&dev_attr_inject_poison.attr,
> >  	NULL,
> >  };
> >  
> > @@ -175,6 +221,13 @@ static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
> >  			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
> >  			return 0;
> >  	}
> > +	if (a == &dev_attr_inject_poison.attr) {
> > +		struct device *dev = kobj_to_dev(kobj);
> > +
> > +		if (!test_bit(CXL_MEM_COMMAND_ID_INJECT_POISON,
> > +			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
> > +			return 0;
> > +	}
> >  	return a->mode;
> >  }
> >  
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 19a9e545ac19..0d4c34be7335 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -396,6 +396,9 @@ struct cxl_mbox_poison_payload_out {
> >  #define CXL_POISON_SOURCE_INJECTED	3
> >  #define CXL_POISON_SOURCE_VENDOR	7
> >  
> > +/* Inject & Clear Poison  CXL 3.0 Spec 8.2.9.8.4.2/3 */
> > +#define CXL_POISON_INJECT_RESERVED	GENMASK_ULL(5, 0)
> > +
> >  /**
> >   * struct cxl_mem_command - Driver representation of a memory device command
> >   * @info: Command information as it exists for the UAPI  
> 
>
Dave Jiang Dec. 1, 2022, 4:42 p.m. UTC | #3
On 11/30/2022 7:40 AM, Jonathan Cameron wrote:
> On Wed, 30 Nov 2022 14:31:36 +0000
> Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:
> 
>> On Tue, 29 Nov 2022 20:34:33 -0800
>> alison.schofield@intel.com wrote:
>>
>>> From: Alison Schofield <alison.schofield@intel.com>
>>>
>>> CXL devices optionally support the INJECT POISON mailbox command. Add
>>> a sysfs attribute and memdev driver support for injecting poison.
>>>
>>> When a Device Physical Address (DPA) is written to the inject_poison
>>> sysfs attribute send an inject poison command to the device for the
>>> specified address.
>>>
>>> Per the CXL Specification (8.2.9.8.4.2), after receiving a valid
>>> inject poison request, the device will return poison when the address
>>> is accessed through the CXL.mem bus. Injecting poison adds the address
>>> to the device's Poison List and the error source is set to injected
>>> error. In addition, the device adds a poison creation event to its
>>> internal Informational Event log, updates the Event Status register,
>>> and if configured, interrupts the host.
>>>
>>> Also, per the CXL Specification, it is not an error to inject poison
>>> into an address that already has poison present and no error is returned
>>> from the device. The memdev driver performs basic sanity checking on the
>>> address, however, it does not go as far as reading the poison list to see
>>> if the address is on the list. That discovery is left to the device.
>>>
>>> The inject_poison attribute is only visible for devices supporting
>>> the capability.
>>>
>>> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
>> A few trivial things inline.  With those fixes LGTM
>>
>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>>
>>> ---
>>>   Documentation/ABI/testing/sysfs-bus-cxl | 19 +++++++++
>>>   drivers/cxl/core/memdev.c               | 53 +++++++++++++++++++++++++
>>>   drivers/cxl/cxlmem.h                    |  3 ++
>>>   3 files changed, 75 insertions(+)
>>>
>>> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
>>> index b715a4609718..20db97f7a1aa 100644
>>> --- a/Documentation/ABI/testing/sysfs-bus-cxl
>>> +++ b/Documentation/ABI/testing/sysfs-bus-cxl
>>> @@ -416,3 +416,22 @@ Description:
>>>   		if accessed, and the source of the poison. The retrieved
>>>   		errors are logged as kernel trace events with the label
>>>   		'cxl_poison'.
>>> +
>>> +
>>> +What:		/sys/bus/cxl/devices/memX/inject_poison
>>> +Date:		December, 2022
>>> +KernelVersion:	v6.2
>>> +Contact:	linux-cxl@vger.kernel.org
>>> +Description:
>>> +		(WO) When a Device Physical Address (DPA) is written to this
>>> +		attribute the memdev driver sends an inject poison command to
>>> +		the device for the specified address. If successful, the device
>>> +		returns poison when the address is accessed through the CXL.mem
>>> +		bus. Injecting poison adds the address to the device's Poison
>>> +		List and the error source is set to injected error. In addition,
>> "set to Injected."
>>
>> perhaps to match spec naming in Media Error Record.
>>
>>> +		the device adds a poison creation event to its internal
>>> +		Informational Event log, updates the Event Status register, and
>>> +		if configured, interrupts the host. It is not an error to inject
>>> +		poison into an address that already has poison present and no
>>> +		error is returned. The inject_poison attribute is only visible
>>> +                for devices supporting the capability.
>>
>> White space issues (spaces instead of tabs?)
>>
>> Add something about the masked bits / granularity of addresses that are accepted.
>>
>>> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
>>> index d08b7295a01c..71130813030f 100644
>>> --- a/drivers/cxl/core/memdev.c
>>> +++ b/drivers/cxl/core/memdev.c
>>> @@ -142,6 +142,51 @@ static ssize_t trigger_poison_list_store(struct device *dev,
>>>   }
>>>   static DEVICE_ATTR_WO(trigger_poison_list);
>>>   
>>> +static int cxl_validate_poison_dpa(struct cxl_dev_state *cxlds, u64 dpa)
>>> +{
>>> +	if (!resource_size(&cxlds->dpa_res)) {
>>> +		dev_dbg(cxlds->dev, "device has no dpa resource\n");
>>> +		return -EINVAL;
>>> +	}
>>> +	if (dpa < cxlds->dpa_res.start || dpa > cxlds->dpa_res.end) {
>>> +		dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
>>> +			dpa, &cxlds->dpa_res);
>>> +		return -EINVAL;
>>> +	}
>>> +	if ((dpa & CXL_POISON_INJECT_RESERVED) != 0) {
>>> +		dev_dbg(cxlds->dev, "dpa reserve bit(s) [5:0] set 0x%llx\n",
>>> +			dpa);
>>> +		return -EINVAL;
>>> +	}
>>> +	return 0;
>>> +}
>>> +
>>> +static ssize_t inject_poison_store(struct device *dev,
>>> +				   struct device_attribute *attr,
>>> +				   const char *buf, size_t len)
>>> +{
>>> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
>>> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>>> +	u64 dpa;
>>> +	int rc;
>>> +
>>> +	rc = kstrtou64(buf, 0, &dpa);
>>> +	if (rc)
>>> +		return rc;
>>> +	rc = cxl_validate_poison_dpa(cxlds, dpa);
>>> +	if (rc)
>>> +		return rc;
>>> +
>>> +	dpa = cpu_to_le64(dpa);
>>> +	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_INJECT_POISON, &dpa,
>>
>> Endianness?
> 
> Got thrown by the type and missed the cpu_to_le64().  Use a local __le64 so it's all explicit.
> One of the static analysis tools will correctly moan about storing it to a u64
> (can't remember which).

Sparse. I just got yelled at by 0-day for something similar. :)

> 
>>
>>> +			       sizeof(dpa), NULL, cxlds->payload_size);
>>> +	if (rc)
>>> +		return rc;
>>> +
>>> +	return len;
>>> +}
>>> +static DEVICE_ATTR_WO(inject_poison);
>>> +
>>>   static struct attribute *cxl_memdev_attributes[] = {
>>>   	&dev_attr_serial.attr,
>>>   	&dev_attr_firmware_version.attr,
>>> @@ -149,6 +194,7 @@ static struct attribute *cxl_memdev_attributes[] = {
>>>   	&dev_attr_label_storage_size.attr,
>>>   	&dev_attr_numa_node.attr,
>>>   	&dev_attr_trigger_poison_list.attr,
>>> +	&dev_attr_inject_poison.attr,
>>>   	NULL,
>>>   };
>>>   
>>> @@ -175,6 +221,13 @@ static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
>>>   			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
>>>   			return 0;
>>>   	}
>>> +	if (a == &dev_attr_inject_poison.attr) {
>>> +		struct device *dev = kobj_to_dev(kobj);
>>> +
>>> +		if (!test_bit(CXL_MEM_COMMAND_ID_INJECT_POISON,
>>> +			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
>>> +			return 0;
>>> +	}
>>>   	return a->mode;
>>>   }
>>>   
>>> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
>>> index 19a9e545ac19..0d4c34be7335 100644
>>> --- a/drivers/cxl/cxlmem.h
>>> +++ b/drivers/cxl/cxlmem.h
>>> @@ -396,6 +396,9 @@ struct cxl_mbox_poison_payload_out {
>>>   #define CXL_POISON_SOURCE_INJECTED	3
>>>   #define CXL_POISON_SOURCE_VENDOR	7
>>>   
>>> +/* Inject & Clear Poison  CXL 3.0 Spec 8.2.9.8.4.2/3 */
>>> +#define CXL_POISON_INJECT_RESERVED	GENMASK_ULL(5, 0)
>>> +
>>>   /**
>>>    * struct cxl_mem_command - Driver representation of a memory device command
>>>    * @info: Command information as it exists for the UAPI
>>
>>
>
Dave Jiang Dec. 1, 2022, 5:26 p.m. UTC | #4
On 11/29/2022 9:34 PM, alison.schofield@intel.com wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices optionally support the INJECT POISON mailbox command. Add
> a sysfs attribute and memdev driver support for injecting poison.
> 
> When a Device Physical Address (DPA) is written to the inject_poison
> sysfs attribute send an inject poison command to the device for the

A comma between attribute and send would make this read better.

> specified address.
> 
> Per the CXL Specification (8.2.9.8.4.2), after receiving a valid

spec v3?

> inject poison request, the device will return poison when the address
> is accessed through the CXL.mem bus. Injecting poison adds the address
> to the device's Poison List and the error source is set to injected

comma before 'and'.
'the' before 'injected'

> error. In addition, the device adds a poison creation event to its
> internal Informational Event log, updates the Event Status register,
> and if configured, interrupts the host.
> 
> Also, per the CXL Specification, it is not an error to inject poison
> into an address that already has poison present and no error is returned

comma before 'and'.

DJ

> from the device. The memdev driver performs basic sanity checking on the
> address, however, it does not go as far as reading the poison list to see
> if the address is on the list. That discovery is left to the device.
> 
> The inject_poison attribute is only visible for devices supporting
> the capability.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>   Documentation/ABI/testing/sysfs-bus-cxl | 19 +++++++++
>   drivers/cxl/core/memdev.c               | 53 +++++++++++++++++++++++++
>   drivers/cxl/cxlmem.h                    |  3 ++
>   3 files changed, 75 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> index b715a4609718..20db97f7a1aa 100644
> --- a/Documentation/ABI/testing/sysfs-bus-cxl
> +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> @@ -416,3 +416,22 @@ Description:
>   		if accessed, and the source of the poison. The retrieved
>   		errors are logged as kernel trace events with the label
>   		'cxl_poison'.
> +
> +
> +What:		/sys/bus/cxl/devices/memX/inject_poison
> +Date:		December, 2022
> +KernelVersion:	v6.2
> +Contact:	linux-cxl@vger.kernel.org
> +Description:
> +		(WO) When a Device Physical Address (DPA) is written to this
> +		attribute the memdev driver sends an inject poison command to
> +		the device for the specified address. If successful, the device
> +		returns poison when the address is accessed through the CXL.mem
> +		bus. Injecting poison adds the address to the device's Poison
> +		List and the error source is set to injected error. In addition,
> +		the device adds a poison creation event to its internal
> +		Informational Event log, updates the Event Status register, and
> +		if configured, interrupts the host. It is not an error to inject
> +		poison into an address that already has poison present and no
> +		error is returned. The inject_poison attribute is only visible
> +                for devices supporting the capability.
> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index d08b7295a01c..71130813030f 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -142,6 +142,51 @@ static ssize_t trigger_poison_list_store(struct device *dev,
>   }
>   static DEVICE_ATTR_WO(trigger_poison_list);
>   
> +static int cxl_validate_poison_dpa(struct cxl_dev_state *cxlds, u64 dpa)
> +{
> +	if (!resource_size(&cxlds->dpa_res)) {
> +		dev_dbg(cxlds->dev, "device has no dpa resource\n");
> +		return -EINVAL;
> +	}
> +	if (dpa < cxlds->dpa_res.start || dpa > cxlds->dpa_res.end) {
> +		dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
> +			dpa, &cxlds->dpa_res);
> +		return -EINVAL;
> +	}
> +	if ((dpa & CXL_POISON_INJECT_RESERVED) != 0) {
> +		dev_dbg(cxlds->dev, "dpa reserve bit(s) [5:0] set 0x%llx\n",
> +			dpa);
> +		return -EINVAL;
> +	}
> +	return 0;
> +}
> +
> +static ssize_t inject_poison_store(struct device *dev,
> +				   struct device_attribute *attr,
> +				   const char *buf, size_t len)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	u64 dpa;
> +	int rc;
> +
> +	rc = kstrtou64(buf, 0, &dpa);
> +	if (rc)
> +		return rc;
> +	rc = cxl_validate_poison_dpa(cxlds, dpa);
> +	if (rc)
> +		return rc;
> +
> +	dpa = cpu_to_le64(dpa);
> +	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_INJECT_POISON, &dpa,
> +			       sizeof(dpa), NULL, cxlds->payload_size);
> +	if (rc)
> +		return rc;
> +
> +	return len;
> +}
> +static DEVICE_ATTR_WO(inject_poison);
> +
>   static struct attribute *cxl_memdev_attributes[] = {
>   	&dev_attr_serial.attr,
>   	&dev_attr_firmware_version.attr,
> @@ -149,6 +194,7 @@ static struct attribute *cxl_memdev_attributes[] = {
>   	&dev_attr_label_storage_size.attr,
>   	&dev_attr_numa_node.attr,
>   	&dev_attr_trigger_poison_list.attr,
> +	&dev_attr_inject_poison.attr,
>   	NULL,
>   };
>   
> @@ -175,6 +221,13 @@ static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
>   			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
>   			return 0;
>   	}
> +	if (a == &dev_attr_inject_poison.attr) {
> +		struct device *dev = kobj_to_dev(kobj);
> +
> +		if (!test_bit(CXL_MEM_COMMAND_ID_INJECT_POISON,
> +			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
> +			return 0;
> +	}
>   	return a->mode;
>   }
>   
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 19a9e545ac19..0d4c34be7335 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -396,6 +396,9 @@ struct cxl_mbox_poison_payload_out {
>   #define CXL_POISON_SOURCE_INJECTED	3
>   #define CXL_POISON_SOURCE_VENDOR	7
>   
> +/* Inject & Clear Poison  CXL 3.0 Spec 8.2.9.8.4.2/3 */
> +#define CXL_POISON_INJECT_RESERVED	GENMASK_ULL(5, 0)
> +
>   /**
>    * struct cxl_mem_command - Driver representation of a memory device command
>    * @info: Command information as it exists for the UAPI
Dan Williams Dec. 4, 2022, 10:04 p.m. UTC | #5
alison.schofield@ wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices optionally support the INJECT POISON mailbox command. Add
> a sysfs attribute and memdev driver support for injecting poison.
> 
> When a Device Physical Address (DPA) is written to the inject_poison
> sysfs attribute send an inject poison command to the device for the
> specified address.
> 
> Per the CXL Specification (8.2.9.8.4.2), after receiving a valid
> inject poison request, the device will return poison when the address
> is accessed through the CXL.mem bus. Injecting poison adds the address
> to the device's Poison List and the error source is set to injected
> error. In addition, the device adds a poison creation event to its
> internal Informational Event log, updates the Event Status register,
> and if configured, interrupts the host.
> 
> Also, per the CXL Specification, it is not an error to inject poison
> into an address that already has poison present and no error is returned
> from the device. The memdev driver performs basic sanity checking on the
> address, however, it does not go as far as reading the poison list to see
> if the address is on the list. That discovery is left to the device.
> 
> The inject_poison attribute is only visible for devices supporting
> the capability.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  Documentation/ABI/testing/sysfs-bus-cxl | 19 +++++++++
>  drivers/cxl/core/memdev.c               | 53 +++++++++++++++++++++++++
>  drivers/cxl/cxlmem.h                    |  3 ++
>  3 files changed, 75 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> index b715a4609718..20db97f7a1aa 100644
> --- a/Documentation/ABI/testing/sysfs-bus-cxl
> +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> @@ -416,3 +416,22 @@ Description:
>  		if accessed, and the source of the poison. The retrieved
>  		errors are logged as kernel trace events with the label
>  		'cxl_poison'.
> +
> +
> +What:		/sys/bus/cxl/devices/memX/inject_poison
> +Date:		December, 2022
> +KernelVersion:	v6.2
> +Contact:	linux-cxl@vger.kernel.org
> +Description:
> +		(WO) When a Device Physical Address (DPA) is written to this
> +		attribute the memdev driver sends an inject poison command to
> +		the device for the specified address. If successful, the device
> +		returns poison when the address is accessed through the CXL.mem
> +		bus. Injecting poison adds the address to the device's Poison
> +		List and the error source is set to injected error. In addition,
> +		the device adds a poison creation event to its internal
> +		Informational Event log, updates the Event Status register, and
> +		if configured, interrupts the host. It is not an error to inject
> +		poison into an address that already has poison present and no
> +		error is returned. The inject_poison attribute is only visible

This description needs to clarify how much poison is injected and that
the address needs to be 64-byte aligned, or what happens if the address
is not aligned.

I also think this whole facility needs to have its own Kconfig symbol so
that it can be disabled at build time, and it likely needs to disable
itself at run time if Linux fails obtain CXL error reporting control
from the BIOS.

> +                for devices supporting the capability.

Stray indenting...

> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index d08b7295a01c..71130813030f 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -142,6 +142,51 @@ static ssize_t trigger_poison_list_store(struct device *dev,
>  }
>  static DEVICE_ATTR_WO(trigger_poison_list);
>  
> +static int cxl_validate_poison_dpa(struct cxl_dev_state *cxlds, u64 dpa)
> +{
> +	if (!resource_size(&cxlds->dpa_res)) {
> +		dev_dbg(cxlds->dev, "device has no dpa resource\n");
> +		return -EINVAL;
> +	}
> +	if (dpa < cxlds->dpa_res.start || dpa > cxlds->dpa_res.end) {
> +		dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
> +			dpa, &cxlds->dpa_res);
> +		return -EINVAL;
> +	}
> +	if ((dpa & CXL_POISON_INJECT_RESERVED) != 0) {
> +		dev_dbg(cxlds->dev, "dpa reserve bit(s) [5:0] set 0x%llx\n",
> +			dpa);

I think it's less that those bits are reserved and more that poison is
tracked in terms of 64-byte aligned cache lines. So I would do:

    if (!IS_ALIGNED(dpa, 64))

> +		return -EINVAL;
> +	}
> +	return 0;
> +}
> +
> +static ssize_t inject_poison_store(struct device *dev,
> +				   struct device_attribute *attr,
> +				   const char *buf, size_t len)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	u64 dpa;
> +	int rc;
> +
> +	rc = kstrtou64(buf, 0, &dpa);
> +	if (rc)
> +		return rc;
> +	rc = cxl_validate_poison_dpa(cxlds, dpa);
> +	if (rc)
> +		return rc;
> +
> +	dpa = cpu_to_le64(dpa);
> +	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_INJECT_POISON, &dpa,
> +			       sizeof(dpa), NULL, cxlds->payload_size);

I would make it clear that the input payload is a single little endian
64-bit address with something like this:

struct cxl_mbox_poison_inject cmd;

cmd = (struct cxl_mbox_poison_inject) {
	.dpa = __le64_to_cpu(dpa),
};

...where cxl_mbox_poison_inject is:

struct cxl_mbox_poison_inject {
        __le64 dpa;
};

...otherwise its strange to assign the result of cpu_to_le64() to a u64,
I expect sparse would complain about that.

> +	if (rc)
> +		return rc;
> +
> +	return len;
> +}
> +static DEVICE_ATTR_WO(inject_poison);
> +
>  static struct attribute *cxl_memdev_attributes[] = {
>  	&dev_attr_serial.attr,
>  	&dev_attr_firmware_version.attr,
> @@ -149,6 +194,7 @@ static struct attribute *cxl_memdev_attributes[] = {
>  	&dev_attr_label_storage_size.attr,
>  	&dev_attr_numa_node.attr,
>  	&dev_attr_trigger_poison_list.attr,
> +	&dev_attr_inject_poison.attr,
>  	NULL,
>  };
>  
> @@ -175,6 +221,13 @@ static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
>  			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
>  			return 0;
>  	}
> +	if (a == &dev_attr_inject_poison.attr) {
> +		struct device *dev = kobj_to_dev(kobj);
> +
> +		if (!test_bit(CXL_MEM_COMMAND_ID_INJECT_POISON,
> +			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
> +			return 0;
> +	}
>  	return a->mode;
>  }
>  
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 19a9e545ac19..0d4c34be7335 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -396,6 +396,9 @@ struct cxl_mbox_poison_payload_out {
>  #define CXL_POISON_SOURCE_INJECTED	3
>  #define CXL_POISON_SOURCE_VENDOR	7
>  
> +/* Inject & Clear Poison  CXL 3.0 Spec 8.2.9.8.4.2/3 */
> +#define CXL_POISON_INJECT_RESERVED	GENMASK_ULL(5, 0)
> +
>  /**
>   * struct cxl_mem_command - Driver representation of a memory device command
>   * @info: Command information as it exists for the UAPI
> -- 
> 2.37.3
>
Alison Schofield Dec. 8, 2022, 4:16 a.m. UTC | #6
On Sun, Dec 04, 2022 at 02:04:45PM -0800, Dan Williams wrote:
> alison.schofield@ wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices optionally support the INJECT POISON mailbox command. Add
> > a sysfs attribute and memdev driver support for injecting poison.
> > 
> > When a Device Physical Address (DPA) is written to the inject_poison
> > sysfs attribute send an inject poison command to the device for the
> > specified address.
> > 
> > Per the CXL Specification (8.2.9.8.4.2), after receiving a valid
> > inject poison request, the device will return poison when the address
> > is accessed through the CXL.mem bus. Injecting poison adds the address
> > to the device's Poison List and the error source is set to injected
> > error. In addition, the device adds a poison creation event to its
> > internal Informational Event log, updates the Event Status register,
> > and if configured, interrupts the host.
> > 
> > Also, per the CXL Specification, it is not an error to inject poison
> > into an address that already has poison present and no error is returned
> > from the device. The memdev driver performs basic sanity checking on the
> > address, however, it does not go as far as reading the poison list to see
> > if the address is on the list. That discovery is left to the device.
> > 
> > The inject_poison attribute is only visible for devices supporting
> > the capability.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > ---
> >  Documentation/ABI/testing/sysfs-bus-cxl | 19 +++++++++
> >  drivers/cxl/core/memdev.c               | 53 +++++++++++++++++++++++++
> >  drivers/cxl/cxlmem.h                    |  3 ++
> >  3 files changed, 75 insertions(+)
> > 
> > diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> > index b715a4609718..20db97f7a1aa 100644
> > --- a/Documentation/ABI/testing/sysfs-bus-cxl
> > +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> > @@ -416,3 +416,22 @@ Description:
> >  		if accessed, and the source of the poison. The retrieved
> >  		errors are logged as kernel trace events with the label
> >  		'cxl_poison'.
> > +
> > +
> > +What:		/sys/bus/cxl/devices/memX/inject_poison
> > +Date:		December, 2022
> > +KernelVersion:	v6.2
> > +Contact:	linux-cxl@vger.kernel.org
> > +Description:
> > +		(WO) When a Device Physical Address (DPA) is written to this
> > +		attribute the memdev driver sends an inject poison command to
> > +		the device for the specified address. If successful, the device
> > +		returns poison when the address is accessed through the CXL.mem
> > +		bus. Injecting poison adds the address to the device's Poison
> > +		List and the error source is set to injected error. In addition,
> > +		the device adds a poison creation event to its internal
> > +		Informational Event log, updates the Event Status register, and
> > +		if configured, interrupts the host. It is not an error to inject
> > +		poison into an address that already has poison present and no
> > +		error is returned. The inject_poison attribute is only visible
> 
> This description needs to clarify how much poison is injected and that
> the address needs to be 64-byte aligned, or what happens if the address
> is not aligned.

Will clarify!
Is 64 bytes and if it's not aligned, fail -EINVAL.

> 
> I also think this whole facility needs to have its own Kconfig symbol so
> that it can be disabled at build time, and it likely needs to disable
> itself at run time if Linux fails obtain CXL error reporting control
> from the BIOS.

wrt Kconfig and 'whole facility' I'm thinking the Kconfig control
is for Inject&Clear, and that you are not thinking Get Poison List and
Scan Media are part of the group that needs a Kconfig control.

I'm seeking clarification on the error reporting control - as in,
confirmation that the Poison commands need to acquire that control.

> 
> > +                for devices supporting the capability.
> 
> Stray indenting...
> 
> > diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> > index d08b7295a01c..71130813030f 100644
> > --- a/drivers/cxl/core/memdev.c
> > +++ b/drivers/cxl/core/memdev.c
> > @@ -142,6 +142,51 @@ static ssize_t trigger_poison_list_store(struct device *dev,
> >  }
> >  static DEVICE_ATTR_WO(trigger_poison_list);
> >  
> > +static int cxl_validate_poison_dpa(struct cxl_dev_state *cxlds, u64 dpa)
> > +{
> > +	if (!resource_size(&cxlds->dpa_res)) {
> > +		dev_dbg(cxlds->dev, "device has no dpa resource\n");
> > +		return -EINVAL;
> > +	}
> > +	if (dpa < cxlds->dpa_res.start || dpa > cxlds->dpa_res.end) {
> > +		dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
> > +			dpa, &cxlds->dpa_res);
> > +		return -EINVAL;
> > +	}
> > +	if ((dpa & CXL_POISON_INJECT_RESERVED) != 0) {
> > +		dev_dbg(cxlds->dev, "dpa reserve bit(s) [5:0] set 0x%llx\n",
> > +			dpa);
> 
> I think it's less that those bits are reserved and more that poison is
> tracked in terms of 64-byte aligned cache lines. So I would do:
> 
>     if (!IS_ALIGNED(dpa, 64))
> 

:) thanks!


> > +		return -EINVAL;
> > +	}
> > +	return 0;
> > +}
> > +
> > +static ssize_t inject_poison_store(struct device *dev,
> > +				   struct device_attribute *attr,
> > +				   const char *buf, size_t len)
> > +{
> > +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +	u64 dpa;
> > +	int rc;
> > +
> > +	rc = kstrtou64(buf, 0, &dpa);
> > +	if (rc)
> > +		return rc;
> > +	rc = cxl_validate_poison_dpa(cxlds, dpa);
> > +	if (rc)
> > +		return rc;
> > +
> > +	dpa = cpu_to_le64(dpa);
> > +	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_INJECT_POISON, &dpa,
> > +			       sizeof(dpa), NULL, cxlds->payload_size);
> 
> I would make it clear that the input payload is a single little endian
> 64-bit address with something like this:
> 
> struct cxl_mbox_poison_inject cmd;
> 
> cmd = (struct cxl_mbox_poison_inject) {
> 	.dpa = __le64_to_cpu(dpa),
> };
> 
> ...where cxl_mbox_poison_inject is:
> 
> struct cxl_mbox_poison_inject {
>         __le64 dpa;
> };
> 
> ...otherwise its strange to assign the result of cpu_to_le64() to a u64,
> I expect sparse would complain about that.

I did fix up the sparse warning but not w a struct like you suggest.
I was treating this input payload unfairly and unclearly by not giving
it it's own struct definition. I will give it a struct!

> 
> > +	if (rc)
> > +		return rc;
> > +
> > +	return len;
> > +}
> > +static DEVICE_ATTR_WO(inject_poison);
> > +
> >  static struct attribute *cxl_memdev_attributes[] = {
> >  	&dev_attr_serial.attr,
> >  	&dev_attr_firmware_version.attr,
> > @@ -149,6 +194,7 @@ static struct attribute *cxl_memdev_attributes[] = {
> >  	&dev_attr_label_storage_size.attr,
> >  	&dev_attr_numa_node.attr,
> >  	&dev_attr_trigger_poison_list.attr,
> > +	&dev_attr_inject_poison.attr,
> >  	NULL,
> >  };
> >  
> > @@ -175,6 +221,13 @@ static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
> >  			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
> >  			return 0;
> >  	}
> > +	if (a == &dev_attr_inject_poison.attr) {
> > +		struct device *dev = kobj_to_dev(kobj);
> > +
> > +		if (!test_bit(CXL_MEM_COMMAND_ID_INJECT_POISON,
> > +			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
> > +			return 0;
> > +	}
> >  	return a->mode;
> >  }
> >  
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 19a9e545ac19..0d4c34be7335 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -396,6 +396,9 @@ struct cxl_mbox_poison_payload_out {
> >  #define CXL_POISON_SOURCE_INJECTED	3
> >  #define CXL_POISON_SOURCE_VENDOR	7
> >  
> > +/* Inject & Clear Poison  CXL 3.0 Spec 8.2.9.8.4.2/3 */
> > +#define CXL_POISON_INJECT_RESERVED	GENMASK_ULL(5, 0)
> > +
> >  /**
> >   * struct cxl_mem_command - Driver representation of a memory device command
> >   * @info: Command information as it exists for the UAPI
> > -- 
> > 2.37.3
> > 
> 
>
Alison Schofield Dec. 8, 2022, 4:17 a.m. UTC | #7
On Thu, Dec 01, 2022 at 10:26:16AM -0700, Dave Jiang wrote:
> 
> 
> On 11/29/2022 9:34 PM, alison.schofield@intel.com wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices optionally support the INJECT POISON mailbox command. Add
> > a sysfs attribute and memdev driver support for injecting poison.
> > 
> > When a Device Physical Address (DPA) is written to the inject_poison
> > sysfs attribute send an inject poison command to the device for the
> 
> A comma between attribute and send would make this read better.
> 
> > specified address.
> > 
> > Per the CXL Specification (8.2.9.8.4.2), after receiving a valid
> 
> spec v3?
> 
> > inject poison request, the device will return poison when the address
> > is accessed through the CXL.mem bus. Injecting poison adds the address
> > to the device's Poison List and the error source is set to injected
> 
> comma before 'and'.
> 'the' before 'injected'
> 
> > error. In addition, the device adds a poison creation event to its
> > internal Informational Event log, updates the Event Status register,
> > and if configured, interrupts the host.
> > 
> > Also, per the CXL Specification, it is not an error to inject poison
> > into an address that already has poison present and no error is returned
> 
> comma before 'and'.
> 
> DJ

Thanks Dave!
> 
snip
> >
Alison Schofield Dec. 8, 2022, 4:20 a.m. UTC | #8
On Thu, Dec 01, 2022 at 09:42:01AM -0700, Dave Jiang wrote:
> 
> 
> On 11/30/2022 7:40 AM, Jonathan Cameron wrote:
> > On Wed, 30 Nov 2022 14:31:36 +0000
> > Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:
> > 
> > > On Tue, 29 Nov 2022 20:34:33 -0800
> > > alison.schofield@intel.com wrote:
> > > 
> > > > From: Alison Schofield <alison.schofield@intel.com>
> > > > 
> > > > CXL devices optionally support the INJECT POISON mailbox command. Add
> > > > a sysfs attribute and memdev driver support for injecting poison.
> > > > 
> > > > When a Device Physical Address (DPA) is written to the inject_poison
> > > > sysfs attribute send an inject poison command to the device for the
> > > > specified address.
> > > > 
> > > > Per the CXL Specification (8.2.9.8.4.2), after receiving a valid
> > > > inject poison request, the device will return poison when the address
> > > > is accessed through the CXL.mem bus. Injecting poison adds the address
> > > > to the device's Poison List and the error source is set to injected
> > > > error. In addition, the device adds a poison creation event to its
> > > > internal Informational Event log, updates the Event Status register,
> > > > and if configured, interrupts the host.
> > > > 
> > > > Also, per the CXL Specification, it is not an error to inject poison
> > > > into an address that already has poison present and no error is returned
> > > > from the device. The memdev driver performs basic sanity checking on the
> > > > address, however, it does not go as far as reading the poison list to see
> > > > if the address is on the list. That discovery is left to the device.
> > > > 
> > > > The inject_poison attribute is only visible for devices supporting
> > > > the capability.
> > > > 
> > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > A few trivial things inline.  With those fixes LGTM
> > > 
> > > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > 
> > > > ---
> > > >   Documentation/ABI/testing/sysfs-bus-cxl | 19 +++++++++
> > > >   drivers/cxl/core/memdev.c               | 53 +++++++++++++++++++++++++
> > > >   drivers/cxl/cxlmem.h                    |  3 ++
> > > >   3 files changed, 75 insertions(+)
> > > > 
> > > > diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> > > > index b715a4609718..20db97f7a1aa 100644
> > > > --- a/Documentation/ABI/testing/sysfs-bus-cxl
> > > > +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> > > > @@ -416,3 +416,22 @@ Description:
> > > >   		if accessed, and the source of the poison. The retrieved
> > > >   		errors are logged as kernel trace events with the label
> > > >   		'cxl_poison'.
> > > > +
> > > > +
> > > > +What:		/sys/bus/cxl/devices/memX/inject_poison
> > > > +Date:		December, 2022
> > > > +KernelVersion:	v6.2
> > > > +Contact:	linux-cxl@vger.kernel.org
> > > > +Description:
> > > > +		(WO) When a Device Physical Address (DPA) is written to this
> > > > +		attribute the memdev driver sends an inject poison command to
> > > > +		the device for the specified address. If successful, the device
> > > > +		returns poison when the address is accessed through the CXL.mem
> > > > +		bus. Injecting poison adds the address to the device's Poison
> > > > +		List and the error source is set to injected error. In addition,
> > > "set to Injected."
> > > 
> > > perhaps to match spec naming in Media Error Record.
> > > 
> > > > +		the device adds a poison creation event to its internal
> > > > +		Informational Event log, updates the Event Status register, and
> > > > +		if configured, interrupts the host. It is not an error to inject
> > > > +		poison into an address that already has poison present and no
> > > > +		error is returned. The inject_poison attribute is only visible
> > > > +                for devices supporting the capability.
> > > 
> > > White space issues (spaces instead of tabs?)
> > > 
> > > Add something about the masked bits / granularity of addresses that are accepted.
> > > 
> > > > diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> > > > index d08b7295a01c..71130813030f 100644
> > > > --- a/drivers/cxl/core/memdev.c
> > > > +++ b/drivers/cxl/core/memdev.c
> > > > @@ -142,6 +142,51 @@ static ssize_t trigger_poison_list_store(struct device *dev,
> > > >   }
> > > >   static DEVICE_ATTR_WO(trigger_poison_list);
> > > > +static int cxl_validate_poison_dpa(struct cxl_dev_state *cxlds, u64 dpa)
> > > > +{
> > > > +	if (!resource_size(&cxlds->dpa_res)) {
> > > > +		dev_dbg(cxlds->dev, "device has no dpa resource\n");
> > > > +		return -EINVAL;
> > > > +	}
> > > > +	if (dpa < cxlds->dpa_res.start || dpa > cxlds->dpa_res.end) {
> > > > +		dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
> > > > +			dpa, &cxlds->dpa_res);
> > > > +		return -EINVAL;
> > > > +	}
> > > > +	if ((dpa & CXL_POISON_INJECT_RESERVED) != 0) {
> > > > +		dev_dbg(cxlds->dev, "dpa reserve bit(s) [5:0] set 0x%llx\n",
> > > > +			dpa);
> > > > +		return -EINVAL;
> > > > +	}
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static ssize_t inject_poison_store(struct device *dev,
> > > > +				   struct device_attribute *attr,
> > > > +				   const char *buf, size_t len)
> > > > +{
> > > > +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > > > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > > > +	u64 dpa;
> > > > +	int rc;
> > > > +
> > > > +	rc = kstrtou64(buf, 0, &dpa);
> > > > +	if (rc)
> > > > +		return rc;
> > > > +	rc = cxl_validate_poison_dpa(cxlds, dpa);
> > > > +	if (rc)
> > > > +		return rc;
> > > > +
> > > > +	dpa = cpu_to_le64(dpa);
> > > > +	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_INJECT_POISON, &dpa,
> > > 
> > > Endianness?
> > 
> > Got thrown by the type and missed the cpu_to_le64().  Use a local __le64 so it's all explicit.
> > One of the static analysis tools will correctly moan about storing it to a u64
> > (can't remember which).
> 
> Sparse. I just got yelled at by 0-day for something similar. :)
> 

Thanks for all your endian-ness eyes.

> > 
> > > 
> > > > +			       sizeof(dpa), NULL, cxlds->payload_size);
> > > > +	if (rc)
> > > > +		return rc;
> > > > +
> > > > +	return len;
> > > > +}
> > > > +static DEVICE_ATTR_WO(inject_poison);
> > > > +
> > > >   static struct attribute *cxl_memdev_attributes[] = {
> > > >   	&dev_attr_serial.attr,
> > > >   	&dev_attr_firmware_version.attr,
> > > > @@ -149,6 +194,7 @@ static struct attribute *cxl_memdev_attributes[] = {
> > > >   	&dev_attr_label_storage_size.attr,
> > > >   	&dev_attr_numa_node.attr,
> > > >   	&dev_attr_trigger_poison_list.attr,
> > > > +	&dev_attr_inject_poison.attr,
> > > >   	NULL,
> > > >   };
> > > > @@ -175,6 +221,13 @@ static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
> > > >   			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
> > > >   			return 0;
> > > >   	}
> > > > +	if (a == &dev_attr_inject_poison.attr) {
> > > > +		struct device *dev = kobj_to_dev(kobj);
> > > > +
> > > > +		if (!test_bit(CXL_MEM_COMMAND_ID_INJECT_POISON,
> > > > +			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
> > > > +			return 0;
> > > > +	}
> > > >   	return a->mode;
> > > >   }
> > > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > > index 19a9e545ac19..0d4c34be7335 100644
> > > > --- a/drivers/cxl/cxlmem.h
> > > > +++ b/drivers/cxl/cxlmem.h
> > > > @@ -396,6 +396,9 @@ struct cxl_mbox_poison_payload_out {
> > > >   #define CXL_POISON_SOURCE_INJECTED	3
> > > >   #define CXL_POISON_SOURCE_VENDOR	7
> > > > +/* Inject & Clear Poison  CXL 3.0 Spec 8.2.9.8.4.2/3 */
> > > > +#define CXL_POISON_INJECT_RESERVED	GENMASK_ULL(5, 0)
> > > > +
> > > >   /**
> > > >    * struct cxl_mem_command - Driver representation of a memory device command
> > > >    * @info: Command information as it exists for the UAPI
> > > 
> > > 
> >
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index b715a4609718..20db97f7a1aa 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -416,3 +416,22 @@  Description:
 		if accessed, and the source of the poison. The retrieved
 		errors are logged as kernel trace events with the label
 		'cxl_poison'.
+
+
+What:		/sys/bus/cxl/devices/memX/inject_poison
+Date:		December, 2022
+KernelVersion:	v6.2
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(WO) When a Device Physical Address (DPA) is written to this
+		attribute the memdev driver sends an inject poison command to
+		the device for the specified address. If successful, the device
+		returns poison when the address is accessed through the CXL.mem
+		bus. Injecting poison adds the address to the device's Poison
+		List and the error source is set to injected error. In addition,
+		the device adds a poison creation event to its internal
+		Informational Event log, updates the Event Status register, and
+		if configured, interrupts the host. It is not an error to inject
+		poison into an address that already has poison present and no
+		error is returned. The inject_poison attribute is only visible
+                for devices supporting the capability.
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index d08b7295a01c..71130813030f 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -142,6 +142,51 @@  static ssize_t trigger_poison_list_store(struct device *dev,
 }
 static DEVICE_ATTR_WO(trigger_poison_list);
 
+static int cxl_validate_poison_dpa(struct cxl_dev_state *cxlds, u64 dpa)
+{
+	if (!resource_size(&cxlds->dpa_res)) {
+		dev_dbg(cxlds->dev, "device has no dpa resource\n");
+		return -EINVAL;
+	}
+	if (dpa < cxlds->dpa_res.start || dpa > cxlds->dpa_res.end) {
+		dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
+			dpa, &cxlds->dpa_res);
+		return -EINVAL;
+	}
+	if ((dpa & CXL_POISON_INJECT_RESERVED) != 0) {
+		dev_dbg(cxlds->dev, "dpa reserve bit(s) [5:0] set 0x%llx\n",
+			dpa);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static ssize_t inject_poison_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t len)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	u64 dpa;
+	int rc;
+
+	rc = kstrtou64(buf, 0, &dpa);
+	if (rc)
+		return rc;
+	rc = cxl_validate_poison_dpa(cxlds, dpa);
+	if (rc)
+		return rc;
+
+	dpa = cpu_to_le64(dpa);
+	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_INJECT_POISON, &dpa,
+			       sizeof(dpa), NULL, cxlds->payload_size);
+	if (rc)
+		return rc;
+
+	return len;
+}
+static DEVICE_ATTR_WO(inject_poison);
+
 static struct attribute *cxl_memdev_attributes[] = {
 	&dev_attr_serial.attr,
 	&dev_attr_firmware_version.attr,
@@ -149,6 +194,7 @@  static struct attribute *cxl_memdev_attributes[] = {
 	&dev_attr_label_storage_size.attr,
 	&dev_attr_numa_node.attr,
 	&dev_attr_trigger_poison_list.attr,
+	&dev_attr_inject_poison.attr,
 	NULL,
 };
 
@@ -175,6 +221,13 @@  static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
 			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
 			return 0;
 	}
+	if (a == &dev_attr_inject_poison.attr) {
+		struct device *dev = kobj_to_dev(kobj);
+
+		if (!test_bit(CXL_MEM_COMMAND_ID_INJECT_POISON,
+			      to_cxl_memdev(dev)->cxlds->enabled_cmds))
+			return 0;
+	}
 	return a->mode;
 }
 
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 19a9e545ac19..0d4c34be7335 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -396,6 +396,9 @@  struct cxl_mbox_poison_payload_out {
 #define CXL_POISON_SOURCE_INJECTED	3
 #define CXL_POISON_SOURCE_VENDOR	7
 
+/* Inject & Clear Poison  CXL 3.0 Spec 8.2.9.8.4.2/3 */
+#define CXL_POISON_INJECT_RESERVED	GENMASK_ULL(5, 0)
+
 /**
  * struct cxl_mem_command - Driver representation of a memory device command
  * @info: Command information as it exists for the UAPI