diff mbox series

[v5,02/12] cxl/memdev: Add support for the Clear Poison mailbox command

Message ID 548e2a175a2f20cdc886297430102ee851d30f26.1679892337.git.alison.schofield@intel.com
State Superseded
Headers show
Series cxl: CXL Inject & Clear Poison | expand

Commit Message

Alison Schofield March 27, 2023, 5:03 a.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

CXL devices optionally support the CLEAR POISON mailbox command. Add
memdev driver support for clearing poison.

Per the CXL Specification (3.0 8.2.9.8.4.3), after receiving a valid
clear poison request, the device removes the address from the device's
Poison List and writes 0 (zero) for 64 bytes starting at address. If
the device cannot clear poison from the address, it returns a permanent
media error and -ENXIO is returned to the user.

Additionally, and per the spec also, it is not an error to clear poison
of an address that is not poisoned. In this case, the device does not
overwrite the address and the device does not return an error.

If the address is not contained in the device's dpa resource, or is
not 64 byte aligned, return -EINVAL without issuing the mbox command.

Poison clearing is intended for debug only and will be exposed to
userspace through debugfs. Restrict compilation to CONFIG_DEBUG_FS.

Implementation note: Although the CXL specification defines the clear
command to accept 64 bytes of 'write-data' to be used when clearing
the poisoned address, this implementation always uses 0 (zeros) for
the write-data.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
 drivers/cxl/core/memdev.c | 43 +++++++++++++++++++++++++++++++++++++++
 drivers/cxl/cxlmem.h      |  7 +++++++
 2 files changed, 50 insertions(+)

Comments

Jonathan Cameron March 30, 2023, 6:50 p.m. UTC | #1
On Sun, 26 Mar 2023 22:03:08 -0700
alison.schofield@intel.com wrote:

> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices optionally support the CLEAR POISON mailbox command. Add
> memdev driver support for clearing poison.
> 
> Per the CXL Specification (3.0 8.2.9.8.4.3), after receiving a valid
> clear poison request, the device removes the address from the device's
> Poison List and writes 0 (zero) for 64 bytes starting at address. If
> the device cannot clear poison from the address, it returns a permanent
> media error and -ENXIO is returned to the user.
> 
> Additionally, and per the spec also, it is not an error to clear poison
> of an address that is not poisoned. In this case, the device does not
> overwrite the address and the device does not return an error.

That's not in line with the spec.

"Clear Poison Write Data: The data the device shall always write into the
requested physical address, atomically, while clearing poison if the location
is marked as being poisoned."

The overwrite always happens whether or not it's poisoned.

Other than that
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

> 
> If the address is not contained in the device's dpa resource, or is
> not 64 byte aligned, return -EINVAL without issuing the mbox command.
> 
> Poison clearing is intended for debug only and will be exposed to
> userspace through debugfs. Restrict compilation to CONFIG_DEBUG_FS.
> 
> Implementation note: Although the CXL specification defines the clear
> command to accept 64 bytes of 'write-data' to be used when clearing
> the poisoned address, this implementation always uses 0 (zeros) for
> the write-data.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  drivers/cxl/core/memdev.c | 43 +++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/cxlmem.h      |  7 +++++++
>  2 files changed, 50 insertions(+)
> 
> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index 3b3ac2868848..0e39c3c3fb09 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -268,6 +268,49 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, CXL);
>  
> +int cxl_clear_poison(struct device *dev, u64 dpa)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_mbox_clear_poison clear;
> +	struct cxl_mbox_cmd mbox_cmd;
> +	int rc;
> +
> +	if (!IS_ENABLED(CONFIG_DEBUG_FS))
> +		return 0;
> +
> +	down_read(&cxl_dpa_rwsem);
> +	rc = cxl_validate_poison_dpa(cxlmd, dpa);
> +	if (rc)
> +		goto out;
> +
> +	/*
> +	 * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
> +	 * is defined to accept 64 bytes of 'write-data', along with the
> +	 * address to clear. The device writes the data into the address
> +	 * atomically, while clearing poison if the location is marked as
> +	 * being poisoned.

This description is correct.

> +	 *
> +	 * Always use '0' for the write-data.
> +	 */
> +	clear = (struct cxl_mbox_clear_poison) {
> +		.address = cpu_to_le64(dpa)
> +	};
> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = CXL_MBOX_OP_CLEAR_POISON,
> +		.size_in = sizeof(clear),
> +		.payload_in = &clear,
> +	};
> +
> +	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
> +
> +out:
> +	up_read(&cxl_dpa_rwsem);
> +
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, CXL);
> +
>  static struct attribute *cxl_memdev_attributes[] = {
>  	&dev_attr_serial.attr,
>  	&dev_attr_firmware_version.attr,
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 527efef2d700..1d8677ab2306 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -607,6 +607,12 @@ struct cxl_mbox_inject_poison {
>  	__le64 address;
>  };
>  
> +/* Clear Poison  CXL 3.0 Spec 8.2.9.8.4.3 */
> +struct cxl_mbox_clear_poison {
> +	__le64 address;
> +	u8 write_data[CXL_POISON_LEN_MULT];
> +} __packed;
> +
>  /**
>   * struct cxl_mem_command - Driver representation of a memory device command
>   * @info: Command information as it exists for the UAPI
> @@ -684,6 +690,7 @@ ssize_t cxl_trigger_poison_list(struct device *dev,
>  				struct device_attribute *attr, const char *buf,
>  				size_t len);
>  int cxl_inject_poison(struct device *dev, u64 dpa);
> +int cxl_clear_poison(struct device *dev, u64 dpa);
>  
>  #ifdef CONFIG_CXL_SUSPEND
>  void cxl_mem_active_inc(void);
Alison Schofield March 30, 2023, 8:12 p.m. UTC | #2
On Thu, Mar 30, 2023 at 11:50:18AM -0700, Jonathan Cameron wrote:
> On Sun, 26 Mar 2023 22:03:08 -0700
> alison.schofield@intel.com wrote:
> 
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices optionally support the CLEAR POISON mailbox command. Add
> > memdev driver support for clearing poison.
> > 
> > Per the CXL Specification (3.0 8.2.9.8.4.3), after receiving a valid
> > clear poison request, the device removes the address from the device's
> > Poison List and writes 0 (zero) for 64 bytes starting at address. If
> > the device cannot clear poison from the address, it returns a permanent
> > media error and -ENXIO is returned to the user.
> > 
> > Additionally, and per the spec also, it is not an error to clear poison
> > of an address that is not poisoned. In this case, the device does not
> > overwrite the address and the device does not return an error.
> 
> That's not inline with the spec.
> 
> "Clear Poison Write Data: The data the device shall always write into the
> requested physical address, atomically, while clearing poison if the location
> is marked as being poisoned."
> 
> The overwrite always happens whether or not it's poisoned.

Jonathan,

I read that with an emphasis on that final 'if' clause:
"The data the device shall always write (...blah blah blah...) if the
location is marked as being poisoned."

So, if the location was not marked as being poisoned, the device won't
write anything.

Which means, the user cannot use the Clear command to randomly write stuff
wherever they please.

What do you think of that?

Alison


> Other than that
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> > 
> > If the address is not contained in the device's dpa resource, or is
> > not 64 byte aligned, return -EINVAL without issuing the mbox command.
> > 
> > Poison clearing is intended for debug only and will be exposed to
> > userspace through debugfs. Restrict compilation to CONFIG_DEBUG_FS.
> > 
> > Implementation note: Although the CXL specification defines the clear
> > command to accept 64 bytes of 'write-data' to be used when clearing
> > the poisoned address, this implementation always uses 0 (zeros) for
> > the write-data.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > ---
> >  drivers/cxl/core/memdev.c | 43 +++++++++++++++++++++++++++++++++++++++
> >  drivers/cxl/cxlmem.h      |  7 +++++++
> >  2 files changed, 50 insertions(+)
> > 
> > diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> > index 3b3ac2868848..0e39c3c3fb09 100644
> > --- a/drivers/cxl/core/memdev.c
> > +++ b/drivers/cxl/core/memdev.c
> > @@ -268,6 +268,49 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, CXL);
> >  
> > +int cxl_clear_poison(struct device *dev, u64 dpa)
> > +{
> > +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > +	struct cxl_mbox_clear_poison clear;
> > +	struct cxl_mbox_cmd mbox_cmd;
> > +	int rc;
> > +
> > +	if (!IS_ENABLED(CONFIG_DEBUG_FS))
> > +		return 0;
> > +
> > +	down_read(&cxl_dpa_rwsem);
> > +	rc = cxl_validate_poison_dpa(cxlmd, dpa);
> > +	if (rc)
> > +		goto out;
> > +
> > +	/*
> > +	 * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
> > +	 * is defined to accept 64 bytes of 'write-data', along with the
> > +	 * address to clear. The device writes the data into the address
> > +	 * atomically, while clearing poison if the location is marked as
> > +	 * being poisoned.
> 
> This description is correct.
> 
> > +	 *
> > +	 * Always use '0' for the write-data.
> > +	 */
> > +	clear = (struct cxl_mbox_clear_poison) {
> > +		.address = cpu_to_le64(dpa)
> > +	};
> > +
> > +	mbox_cmd = (struct cxl_mbox_cmd) {
> > +		.opcode = CXL_MBOX_OP_CLEAR_POISON,
> > +		.size_in = sizeof(clear),
> > +		.payload_in = &clear,
> > +	};
> > +
> > +	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
> > +
> > +out:
> > +	up_read(&cxl_dpa_rwsem);
> > +
> > +	return rc;
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, CXL);
> > +
> >  static struct attribute *cxl_memdev_attributes[] = {
> >  	&dev_attr_serial.attr,
> >  	&dev_attr_firmware_version.attr,
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 527efef2d700..1d8677ab2306 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -607,6 +607,12 @@ struct cxl_mbox_inject_poison {
> >  	__le64 address;
> >  };
> >  
> > +/* Clear Poison  CXL 3.0 Spec 8.2.9.8.4.3 */
> > +struct cxl_mbox_clear_poison {
> > +	__le64 address;
> > +	u8 write_data[CXL_POISON_LEN_MULT];
> > +} __packed;
> > +
> >  /**
> >   * struct cxl_mem_command - Driver representation of a memory device command
> >   * @info: Command information as it exists for the UAPI
> > @@ -684,6 +690,7 @@ ssize_t cxl_trigger_poison_list(struct device *dev,
> >  				struct device_attribute *attr, const char *buf,
> >  				size_t len);
> >  int cxl_inject_poison(struct device *dev, u64 dpa);
> > +int cxl_clear_poison(struct device *dev, u64 dpa);
> >  
> >  #ifdef CONFIG_CXL_SUSPEND
> >  void cxl_mem_active_inc(void);
>
Dave Jiang March 31, 2023, 6:40 p.m. UTC | #3
On 3/26/23 10:03 PM, alison.schofield@intel.com wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices optionally support the CLEAR POISON mailbox command. Add
> memdev driver support for clearing poison.
> 
> Per the CXL Specification (3.0 8.2.9.8.4.3), after receiving a valid
> clear poison request, the device removes the address from the device's
> Poison List and writes 0 (zero) for 64 bytes starting at address. If
> the device cannot clear poison from the address, it returns a permanent
> media error and -ENXIO is returned to the user.
> 
> Additionally, and per the spec also, it is not an error to clear poison
> of an address that is not poisoned. In this case, the device does not
> overwrite the address and the device does not return an error.
> 
> If the address is not contained in the device's dpa resource, or is
> not 64 byte aligned, return -EINVAL without issuing the mbox command.
> 
> Poison clearing is intended for debug only and will be exposed to
> userspace through debugfs. Restrict compilation to CONFIG_DEBUG_FS.
> 
> Implementation note: Although the CXL specification defines the clear
> command to accept 64 bytes of 'write-data' to be used when clearing
> the poisoned address, this implementation always uses 0 (zeros) for
> the write-data.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>   drivers/cxl/core/memdev.c | 43 +++++++++++++++++++++++++++++++++++++++
>   drivers/cxl/cxlmem.h      |  7 +++++++
>   2 files changed, 50 insertions(+)
> 
> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index 3b3ac2868848..0e39c3c3fb09 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -268,6 +268,49 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
>   }
>   EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, CXL);
>   
> +int cxl_clear_poison(struct device *dev, u64 dpa)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_mbox_clear_poison clear;
> +	struct cxl_mbox_cmd mbox_cmd;
> +	int rc;
> +
> +	if (!IS_ENABLED(CONFIG_DEBUG_FS))
> +		return 0;
> +
> +	down_read(&cxl_dpa_rwsem);
> +	rc = cxl_validate_poison_dpa(cxlmd, dpa);
> +	if (rc)
> +		goto out;
> +
> +	/*
> +	 * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
> +	 * is defined to accept 64 bytes of 'write-data', along with the
> +	 * address to clear. The device writes the data into the address
> +	 * atomically, while clearing poison if the location is marked as
> +	 * being poisoned.
> +	 *
> +	 * Always use '0' for the write-data.
> +	 */
> +	clear = (struct cxl_mbox_clear_poison) {
> +		.address = cpu_to_le64(dpa)
> +	};

The write_data[] should be 0s in order to clear the poison, right? Since
'clear' is allocated on the stack, if it's not initialized then
write_data[] would contain random garbage. You could just init all
'clear' members when you declare the variable at top if you like.

DJ

> +
> +	mbox_cmd = (struct cxl_mbox_cmd) {
> +		.opcode = CXL_MBOX_OP_CLEAR_POISON,
> +		.size_in = sizeof(clear),
> +		.payload_in = &clear,
> +	};
> +
> +	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
> +
> +out:
> +	up_read(&cxl_dpa_rwsem);
> +
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, CXL);
> +
>   static struct attribute *cxl_memdev_attributes[] = {
>   	&dev_attr_serial.attr,
>   	&dev_attr_firmware_version.attr,
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 527efef2d700..1d8677ab2306 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -607,6 +607,12 @@ struct cxl_mbox_inject_poison {
>   	__le64 address;
>   };
>   
> +/* Clear Poison  CXL 3.0 Spec 8.2.9.8.4.3 */
> +struct cxl_mbox_clear_poison {
> +	__le64 address;
> +	u8 write_data[CXL_POISON_LEN_MULT];
> +} __packed;
> +
>   /**
>    * struct cxl_mem_command - Driver representation of a memory device command
>    * @info: Command information as it exists for the UAPI
> @@ -684,6 +690,7 @@ ssize_t cxl_trigger_poison_list(struct device *dev,
>   				struct device_attribute *attr, const char *buf,
>   				size_t len);
>   int cxl_inject_poison(struct device *dev, u64 dpa);
> +int cxl_clear_poison(struct device *dev, u64 dpa);
>   
>   #ifdef CONFIG_CXL_SUSPEND
>   void cxl_mem_active_inc(void);
Alison Schofield March 31, 2023, 7:55 p.m. UTC | #4
On Fri, Mar 31, 2023 at 11:40:01AM -0700, Dave Jiang wrote:
> 
> 
> On 3/26/23 10:03 PM, alison.schofield@intel.com wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices optionally support the CLEAR POISON mailbox command. Add
> > memdev driver support for clearing poison.
> > 
> > Per the CXL Specification (3.0 8.2.9.8.4.3), after receiving a valid
> > clear poison request, the device removes the address from the device's
> > Poison List and writes 0 (zero) for 64 bytes starting at address. If
> > the device cannot clear poison from the address, it returns a permanent
> > media error and -ENXIO is returned to the user.
> > 
> > Additionally, and per the spec also, it is not an error to clear poison
> > of an address that is not poisoned. In this case, the device does not
> > overwrite the address and the device does not return an error.
> > 
> > If the address is not contained in the device's dpa resource, or is
> > not 64 byte aligned, return -EINVAL without issuing the mbox command.
> > 
> > Poison clearing is intended for debug only and will be exposed to
> > userspace through debugfs. Restrict compilation to CONFIG_DEBUG_FS.
> > 
> > Implementation note: Although the CXL specification defines the clear
> > command to accept 64 bytes of 'write-data' to be used when clearing
> > the poisoned address, this implementation always uses 0 (zeros) for
> > the write-data.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > ---
> >   drivers/cxl/core/memdev.c | 43 +++++++++++++++++++++++++++++++++++++++
> >   drivers/cxl/cxlmem.h      |  7 +++++++
> >   2 files changed, 50 insertions(+)
> > 
> > diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> > index 3b3ac2868848..0e39c3c3fb09 100644
> > --- a/drivers/cxl/core/memdev.c
> > +++ b/drivers/cxl/core/memdev.c
> > @@ -268,6 +268,49 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
> >   }
> >   EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, CXL);
> > +int cxl_clear_poison(struct device *dev, u64 dpa)
> > +{
> > +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > +	struct cxl_mbox_clear_poison clear;
> > +	struct cxl_mbox_cmd mbox_cmd;
> > +	int rc;
> > +
> > +	if (!IS_ENABLED(CONFIG_DEBUG_FS))
> > +		return 0;
> > +
> > +	down_read(&cxl_dpa_rwsem);
> > +	rc = cxl_validate_poison_dpa(cxlmd, dpa);
> > +	if (rc)
> > +		goto out;
> > +
> > +	/*
> > +	 * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
> > +	 * is defined to accept 64 bytes of 'write-data', along with the
> > +	 * address to clear. The device writes the data into the address
> > +	 * atomically, while clearing poison if the location is marked as
> > +	 * being poisoned.
> > +	 *
> > +	 * Always use '0' for the write-data.
> > +	 */
> > +	clear = (struct cxl_mbox_clear_poison) {
> > +		.address = cpu_to_le64(dpa)
> > +	};
> 
> The write_data[] should be 0s in order to clear the poison right? Since
> 'clear' is allocated on the stack, if it's not initialized then it would be
> random garbage in the data. You could just init all 'clear' members when you
> declare the variable at top if you like.

Assigning a compound literal like this initializes any unspecified
fields to zero. This is the same initialization used across all the
mbox_cmd setups here and in core/mbox.c.

Am I using that construct incorrectly?

> 
> DJ
> 
> > +
> > +	mbox_cmd = (struct cxl_mbox_cmd) {
> > +		.opcode = CXL_MBOX_OP_CLEAR_POISON,
> > +		.size_in = sizeof(clear),
> > +		.payload_in = &clear,
> > +	};
> > +
> > +	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
> > +
> > +out:
> > +	up_read(&cxl_dpa_rwsem);
> > +
> > +	return rc;
> > +}
> > +EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, CXL);
> > +
> >   static struct attribute *cxl_memdev_attributes[] = {
> >   	&dev_attr_serial.attr,
> >   	&dev_attr_firmware_version.attr,
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 527efef2d700..1d8677ab2306 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -607,6 +607,12 @@ struct cxl_mbox_inject_poison {
> >   	__le64 address;
> >   };
> > +/* Clear Poison  CXL 3.0 Spec 8.2.9.8.4.3 */
> > +struct cxl_mbox_clear_poison {
> > +	__le64 address;
> > +	u8 write_data[CXL_POISON_LEN_MULT];
> > +} __packed;
> > +
> >   /**
> >    * struct cxl_mem_command - Driver representation of a memory device command
> >    * @info: Command information as it exists for the UAPI
> > @@ -684,6 +690,7 @@ ssize_t cxl_trigger_poison_list(struct device *dev,
> >   				struct device_attribute *attr, const char *buf,
> >   				size_t len);
> >   int cxl_inject_poison(struct device *dev, u64 dpa);
> > +int cxl_clear_poison(struct device *dev, u64 dpa);
> >   #ifdef CONFIG_CXL_SUSPEND
> >   void cxl_mem_active_inc(void);
Dave Jiang March 31, 2023, 9:18 p.m. UTC | #5
On 3/31/23 12:55 PM, Alison Schofield wrote:
> On Fri, Mar 31, 2023 at 11:40:01AM -0700, Dave Jiang wrote:
>>
>>
>> On 3/26/23 10:03 PM, alison.schofield@intel.com wrote:
>>> From: Alison Schofield <alison.schofield@intel.com>
>>>
>>> CXL devices optionally support the CLEAR POISON mailbox command. Add
>>> memdev driver support for clearing poison.
>>>
>>> Per the CXL Specification (3.0 8.2.9.8.4.3), after receiving a valid
>>> clear poison request, the device removes the address from the device's
>>> Poison List and writes 0 (zero) for 64 bytes starting at address. If
>>> the device cannot clear poison from the address, it returns a permanent
>>> media error and -ENXIO is returned to the user.
>>>
>>> Additionally, and per the spec also, it is not an error to clear poison
>>> of an address that is not poisoned. In this case, the device does not
>>> overwrite the address and the device does not return an error.
>>>
>>> If the address is not contained in the device's dpa resource, or is
>>> not 64 byte aligned, return -EINVAL without issuing the mbox command.
>>>
>>> Poison clearing is intended for debug only and will be exposed to
>>> userspace through debugfs. Restrict compilation to CONFIG_DEBUG_FS.
>>>
>>> Implementation note: Although the CXL specification defines the clear
>>> command to accept 64 bytes of 'write-data' to be used when clearing
>>> the poisoned address, this implementation always uses 0 (zeros) for
>>> the write-data.
>>>
>>> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
>>> ---
>>>    drivers/cxl/core/memdev.c | 43 +++++++++++++++++++++++++++++++++++++++
>>>    drivers/cxl/cxlmem.h      |  7 +++++++
>>>    2 files changed, 50 insertions(+)
>>>
>>> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
>>> index 3b3ac2868848..0e39c3c3fb09 100644
>>> --- a/drivers/cxl/core/memdev.c
>>> +++ b/drivers/cxl/core/memdev.c
>>> @@ -268,6 +268,49 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
>>>    }
>>>    EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, CXL);
>>> +int cxl_clear_poison(struct device *dev, u64 dpa)
>>> +{
>>> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
>>> +	struct cxl_mbox_clear_poison clear;
>>> +	struct cxl_mbox_cmd mbox_cmd;
>>> +	int rc;
>>> +
>>> +	if (!IS_ENABLED(CONFIG_DEBUG_FS))
>>> +		return 0;
>>> +
>>> +	down_read(&cxl_dpa_rwsem);
>>> +	rc = cxl_validate_poison_dpa(cxlmd, dpa);
>>> +	if (rc)
>>> +		goto out;
>>> +
>>> +	/*
>>> +	 * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
>>> +	 * is defined to accept 64 bytes of 'write-data', along with the
>>> +	 * address to clear. The device writes the data into the address
>>> +	 * atomically, while clearing poison if the location is marked as
>>> +	 * being poisoned.
>>> +	 *
>>> +	 * Always use '0' for the write-data.
>>> +	 */
>>> +	clear = (struct cxl_mbox_clear_poison) {
>>> +		.address = cpu_to_le64(dpa)
>>> +	};
>>
>> The write_data[] should be 0s in order to clear the poison right? Since
>> 'clear' is allocated on the stack, if it's not initialized then it would be
>> random garbage in the data. You could just init all 'clear' members when you
>> declare the variable at top if you like.
> 
> Declaring like this initializes any unspecified fields to zero.
> This is the same initialization used across all the mbox_cmd setups
> here and in core/mbox.c.

I thought you need to do:
	clear = (struct cxl_mbox_clear_poison) {
		.address = cpu_to_le64(dpa),
		.write_data = { 0 },
	};

I didn't think it would initialize the other members to 0 if you omit
them. But my simple C code test seems to indicate otherwise. So sorry
about the noise.

> 
> Am I using that construct incorrectly?
> 
>>
>> DJ
>>
>>> +
>>> +	mbox_cmd = (struct cxl_mbox_cmd) {
>>> +		.opcode = CXL_MBOX_OP_CLEAR_POISON,
>>> +		.size_in = sizeof(clear),
>>> +		.payload_in = &clear,
>>> +	};
>>> +
>>> +	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
>>> +
>>> +out:
>>> +	up_read(&cxl_dpa_rwsem);
>>> +
>>> +	return rc;
>>> +}
>>> +EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, CXL);
>>> +
>>>    static struct attribute *cxl_memdev_attributes[] = {
>>>    	&dev_attr_serial.attr,
>>>    	&dev_attr_firmware_version.attr,
>>> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
>>> index 527efef2d700..1d8677ab2306 100644
>>> --- a/drivers/cxl/cxlmem.h
>>> +++ b/drivers/cxl/cxlmem.h
>>> @@ -607,6 +607,12 @@ struct cxl_mbox_inject_poison {
>>>    	__le64 address;
>>>    };
>>> +/* Clear Poison  CXL 3.0 Spec 8.2.9.8.4.3 */
>>> +struct cxl_mbox_clear_poison {
>>> +	__le64 address;
>>> +	u8 write_data[CXL_POISON_LEN_MULT];
>>> +} __packed;
>>> +
>>>    /**
>>>     * struct cxl_mem_command - Driver representation of a memory device command
>>>     * @info: Command information as it exists for the UAPI
>>> @@ -684,6 +690,7 @@ ssize_t cxl_trigger_poison_list(struct device *dev,
>>>    				struct device_attribute *attr, const char *buf,
>>>    				size_t len);
>>>    int cxl_inject_poison(struct device *dev, u64 dpa);
>>> +int cxl_clear_poison(struct device *dev, u64 dpa);
>>>    #ifdef CONFIG_CXL_SUSPEND
>>>    void cxl_mem_active_inc(void);
Jonathan Cameron April 3, 2023, 2:08 p.m. UTC | #6
On Thu, 30 Mar 2023 13:12:23 -0700
Alison Schofield <alison.schofield@intel.com> wrote:

> On Thu, Mar 30, 2023 at 11:50:18AM -0700, Jonathan Cameron wrote:
> > On Sun, 26 Mar 2023 22:03:08 -0700
> > alison.schofield@intel.com wrote:
> >   
> > > From: Alison Schofield <alison.schofield@intel.com>
> > > 
> > > CXL devices optionally support the CLEAR POISON mailbox command. Add
> > > memdev driver support for clearing poison.
> > > 
> > > Per the CXL Specification (3.0 8.2.9.8.4.3), after receiving a valid
> > > clear poison request, the device removes the address from the device's
> > > Poison List and writes 0 (zero) for 64 bytes starting at address. If
> > > the device cannot clear poison from the address, it returns a permanent
> > > media error and -ENXIO is returned to the user.
> > > 
> > > Additionally, and per the spec also, it is not an error to clear poison
> > > of an address that is not poisoned. In this case, the device does not
> > > overwrite the address and the device does not return an error.  
> > 
> > That's not inline with the spec.
> > 
> > "Clear Poison Write Data: The data the device shall always write into the
> > requested physical address, atomically, while clearing poison if the location
> > is marked as being poisoned."
> > 
> > The overwrite always happens whether or not it's poisoned.  
> 
> Jonathan,
> 
> I read that with an emphasis on that final 'if' clause:
> "The data the device shall always write (...blah blah blah...) if the
> location is marked as being poisoned.
> 
> So, if the location was not marked as being poisoned, the device won't
> write anything.
> 
> Which means, the user cannot use the Clear command to randomly write stuff
> wherever they please.
> 
> What do you think of that ? 

Clarification needed for the spec perhaps.  I'd argue the 'always' is there to make
it clear that it does the write whether or not that condition is present. Otherwise,
that word has no purpose in the sentence.  Hence the user can write random
data.  They can do that anyway if they have the ability to inject poison, so I
don't see that mattering a lot.

Jonathan


> 
> Alison
> 
> 
> > Other than that
> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> >   
> > > 
> > > If the address is not contained in the device's dpa resource, or is
> > > not 64 byte aligned, return -EINVAL without issuing the mbox command.
> > > 
> > > Poison clearing is intended for debug only and will be exposed to
> > > userspace through debugfs. Restrict compilation to CONFIG_DEBUG_FS.
> > > 
> > > Implementation note: Although the CXL specification defines the clear
> > > command to accept 64 bytes of 'write-data' to be used when clearing
> > > the poisoned address, this implementation always uses 0 (zeros) for
> > > the write-data.
> > > 
> > > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > > ---
> > >  drivers/cxl/core/memdev.c | 43 +++++++++++++++++++++++++++++++++++++++
> > >  drivers/cxl/cxlmem.h      |  7 +++++++
> > >  2 files changed, 50 insertions(+)
> > > 
> > > diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> > > index 3b3ac2868848..0e39c3c3fb09 100644
> > > --- a/drivers/cxl/core/memdev.c
> > > +++ b/drivers/cxl/core/memdev.c
> > > @@ -268,6 +268,49 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
> > >  }
> > >  EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, CXL);
> > >  
> > > +int cxl_clear_poison(struct device *dev, u64 dpa)
> > > +{
> > > +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> > > +	struct cxl_mbox_clear_poison clear;
> > > +	struct cxl_mbox_cmd mbox_cmd;
> > > +	int rc;
> > > +
> > > +	if (!IS_ENABLED(CONFIG_DEBUG_FS))
> > > +		return 0;
> > > +
> > > +	down_read(&cxl_dpa_rwsem);
> > > +	rc = cxl_validate_poison_dpa(cxlmd, dpa);
> > > +	if (rc)
> > > +		goto out;
> > > +
> > > +	/*
> > > +	 * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
> > > +	 * is defined to accept 64 bytes of 'write-data', along with the
> > > +	 * address to clear. The device writes the data into the address
> > > +	 * atomically, while clearing poison if the location is marked as
> > > +	 * being poisoned.  
> > 
> > This description is correct.
> >   
> > > +	 *
> > > +	 * Always use '0' for the write-data.
> > > +	 */
> > > +	clear = (struct cxl_mbox_clear_poison) {
> > > +		.address = cpu_to_le64(dpa)
> > > +	};
> > > +
> > > +	mbox_cmd = (struct cxl_mbox_cmd) {
> > > +		.opcode = CXL_MBOX_OP_CLEAR_POISON,
> > > +		.size_in = sizeof(clear),
> > > +		.payload_in = &clear,
> > > +	};
> > > +
> > > +	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
> > > +
> > > +out:
> > > +	up_read(&cxl_dpa_rwsem);
> > > +
> > > +	return rc;
> > > +}
> > > +EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, CXL);
> > > +
> > >  static struct attribute *cxl_memdev_attributes[] = {
> > >  	&dev_attr_serial.attr,
> > >  	&dev_attr_firmware_version.attr,
> > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > index 527efef2d700..1d8677ab2306 100644
> > > --- a/drivers/cxl/cxlmem.h
> > > +++ b/drivers/cxl/cxlmem.h
> > > @@ -607,6 +607,12 @@ struct cxl_mbox_inject_poison {
> > >  	__le64 address;
> > >  };
> > >  
> > > +/* Clear Poison  CXL 3.0 Spec 8.2.9.8.4.3 */
> > > +struct cxl_mbox_clear_poison {
> > > +	__le64 address;
> > > +	u8 write_data[CXL_POISON_LEN_MULT];
> > > +} __packed;
> > > +
> > >  /**
> > >   * struct cxl_mem_command - Driver representation of a memory device command
> > >   * @info: Command information as it exists for the UAPI
> > > @@ -684,6 +690,7 @@ ssize_t cxl_trigger_poison_list(struct device *dev,
> > >  				struct device_attribute *attr, const char *buf,
> > >  				size_t len);
> > >  int cxl_inject_poison(struct device *dev, u64 dpa);
> > > +int cxl_clear_poison(struct device *dev, u64 dpa);
> > >  
> > >  #ifdef CONFIG_CXL_SUSPEND
> > >  void cxl_mem_active_inc(void);  
> >
diff mbox series

Patch

diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 3b3ac2868848..0e39c3c3fb09 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -268,6 +268,49 @@  int cxl_inject_poison(struct device *dev, u64 dpa)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, CXL);
 
+int cxl_clear_poison(struct device *dev, u64 dpa)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	struct cxl_mbox_clear_poison clear;
+	struct cxl_mbox_cmd mbox_cmd;
+	int rc;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;	/* debug-only: no-op without debugfs */
+
+	down_read(&cxl_dpa_rwsem);
+	rc = cxl_validate_poison_dpa(cxlmd, dpa);
+	if (rc)
+		goto out;
+
+	/*
+	 * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
+	 * is defined to accept 64 bytes of 'write-data', along with the
+	 * address to clear. The device writes the data into the address
+	 * atomically, while clearing poison if the location is marked as
+	 * being poisoned.
+	 * Always use '0' for the write-data: write_data[] is not named in
+	 * the initializer below, so it is implicitly zero-initialized.
+	 */
+	clear = (struct cxl_mbox_clear_poison) {
+		.address = cpu_to_le64(dpa)
+	};
+
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_CLEAR_POISON,
+		.size_in = sizeof(clear),
+		.payload_in = &clear,
+	};
+
+	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
+
+out:
+	up_read(&cxl_dpa_rwsem);
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, CXL);
+
 static struct attribute *cxl_memdev_attributes[] = {
 	&dev_attr_serial.attr,
 	&dev_attr_firmware_version.attr,
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 527efef2d700..1d8677ab2306 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -607,6 +607,12 @@  struct cxl_mbox_inject_poison {
 	__le64 address;
 };
 
+/* Clear Poison input payload  CXL 3.0 Spec 8.2.9.8.4.3 */
+struct cxl_mbox_clear_poison {
+	__le64 address;	/* little-endian DPA to clear */
+	u8 write_data[CXL_POISON_LEN_MULT];	/* driver sends zeros */
+} __packed;
+
 /**
  * struct cxl_mem_command - Driver representation of a memory device command
  * @info: Command information as it exists for the UAPI
@@ -684,6 +690,7 @@  ssize_t cxl_trigger_poison_list(struct device *dev,
 				struct device_attribute *attr, const char *buf,
 				size_t len);
 int cxl_inject_poison(struct device *dev, u64 dpa);
+int cxl_clear_poison(struct device *dev, u64 dpa);
 
 #ifdef CONFIG_CXL_SUSPEND
 void cxl_mem_active_inc(void);