diff mbox series

[v5,03/12] cxl/memdev: Warn of poison inject or clear to a mapped region

Message ID fc7db7de4778803a3221d7fd5a203bf971fdac61.1679892337.git.alison.schofield@intel.com
State Superseded
Headers show
Series cxl: CXL Inject & Clear Poison | expand

Commit Message

Alison Schofield March 27, 2023, 5:03 a.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

Inject and clear poison capabilities are intended for debug usage only.
In order to be useful in debug environments, the driver needs to allow
inject and clear operations on DPAs mapped in regions.

dev_warn_once() when either operation occurs.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
 drivers/cxl/core/memdev.c | 59 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

Comments

Jonathan Cameron March 30, 2023, 6:55 p.m. UTC | #1
On Sun, 26 Mar 2023 22:03:09 -0700
alison.schofield@intel.com wrote:

> From: Alison Schofield <alison.schofield@intel.com>
> 
> Inject and clear poison capabilities are intended for debug usage only.
> In order to be useful in debug environments, the driver needs to allow
> inject and clear operations on DPAs mapped in regions.
> 
> dev_warn_once() when either operation occurs.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  drivers/cxl/core/memdev.c | 59 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 59 insertions(+)
> 
> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index 0e39c3c3fb09..a83619c31f61 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -213,6 +213,50 @@ ssize_t cxl_trigger_poison_list(struct device *dev,
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_trigger_poison_list, CXL);
>  
> +struct cxl_dpa_to_region_context {
> +	struct cxl_region *cxlr;
> +	u64 dpa;
> +};
> +
> +static int __cxl_dpa_to_region(struct device *dev, void *arg)
> +{
> +	struct cxl_dpa_to_region_context *ctx = arg;
> +	struct cxl_endpoint_decoder *cxled;
> +	u64 dpa = ctx->dpa;
> +
> +	if (!is_endpoint_decoder(dev))
> +		return 0;
> +
> +	cxled = to_cxl_endpoint_decoder(dev);
> +	if (!cxled->dpa_res || !resource_size(cxled->dpa_res))
> +		return 0;
> +
> +	if (dpa > cxled->dpa_res->end || dpa < cxled->dpa_res->start)
> +		return 0;
> +
> +	dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa,
> +		dev_name(&cxled->cxld.region->dev));
> +
> +	ctx->cxlr = cxled->cxld.region;
> +
If we have a match, little point in letting walk continue.

return 1;

Also, I "think" we just know that the association has been built.
Injecting poison is probably still fine if the region / decoder hasn't yet
been committed.

Jonathan


> +	return 0;
> +}
> +
> +static struct cxl_region *cxl_dpa_to_region(struct cxl_memdev *cxlmd, u64 dpa)
> +{
> +	struct cxl_dpa_to_region_context ctx;
> +	struct cxl_port *port;
> +
> +	ctx = (struct cxl_dpa_to_region_context) {
> +		.dpa = dpa,
> +	};
> +	port = dev_get_drvdata(&cxlmd->dev);
> +	if (port && is_cxl_endpoint(port) && port->commit_end != -1)
> +		device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region);
> +
> +	return ctx.cxlr;
> +}
> +
>  static int cxl_validate_poison_dpa(struct cxl_memdev *cxlmd, u64 dpa)
>  {
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> @@ -242,6 +286,7 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
>  	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
>  	struct cxl_mbox_inject_poison inject;
>  	struct cxl_mbox_cmd mbox_cmd;
> +	struct cxl_region *cxlr;
>  	int rc;
>  
>  	if (!IS_ENABLED(CONFIG_DEBUG_FS))
> @@ -261,6 +306,13 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
>  		.payload_in = &inject,
>  	};
>  	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
> +	if (rc)
> +		goto out;
> +
> +	cxlr = cxl_dpa_to_region(cxlmd, dpa);
> +	if (cxlr)
> +		dev_warn_once(dev, "poison inject dpa:0x%llx region: %s\n",
> +			      dpa, dev_name(&cxlr->dev));
>  out:
>  	up_read(&cxl_dpa_rwsem);
>  
> @@ -273,6 +325,7 @@ int cxl_clear_poison(struct device *dev, u64 dpa)
>  	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
>  	struct cxl_mbox_clear_poison clear;
>  	struct cxl_mbox_cmd mbox_cmd;
> +	struct cxl_region *cxlr;
>  	int rc;
>  
>  	if (!IS_ENABLED(CONFIG_DEBUG_FS))
> @@ -303,7 +356,13 @@ int cxl_clear_poison(struct device *dev, u64 dpa)
>  	};
>  
>  	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
> +	if (rc)
> +		goto out;
>  
> +	cxlr = cxl_dpa_to_region(cxlmd, dpa);
> +	if (cxlr)
> +		dev_warn_once(dev, "poison clear dpa:0x%llx region: %s\n",
> +			      dpa, dev_name(&cxlr->dev));
>  out:
>  	up_read(&cxl_dpa_rwsem);
>
Alison Schofield April 11, 2023, 5:43 p.m. UTC | #2
On Thu, Mar 30, 2023 at 07:55:46PM +0100, Jonathan Cameron wrote:
> On Sun, 26 Mar 2023 22:03:09 -0700
> alison.schofield@intel.com wrote:
> 
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > Inject and clear poison capabilities are intended for debug usage only.
> > In order to be useful in debug environments, the driver needs to allow
> > inject and clear operations on DPAs mapped in regions.
> > 
> > dev_warn_once() when either operation occurs.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > ---
> >  drivers/cxl/core/memdev.c | 59 +++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 59 insertions(+)
> > 
> > diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> > index 0e39c3c3fb09..a83619c31f61 100644
> > --- a/drivers/cxl/core/memdev.c
> > +++ b/drivers/cxl/core/memdev.c

snip

> > +static int __cxl_dpa_to_region(struct device *dev, void *arg)
> > +{
> > +	struct cxl_dpa_to_region_context *ctx = arg;
> > +	struct cxl_endpoint_decoder *cxled;
> > +	u64 dpa = ctx->dpa;
> > +
> > +	if (!is_endpoint_decoder(dev))
> > +		return 0;
> > +
> > +	cxled = to_cxl_endpoint_decoder(dev);
> > +	if (!cxled->dpa_res || !resource_size(cxled->dpa_res))
> > +		return 0;
> > +
> > +	if (dpa > cxled->dpa_res->end || dpa < cxled->dpa_res->start)
> > +		return 0;
> > +
> > +	dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa,
> > +		dev_name(&cxled->cxld.region->dev));
> > +
> > +	ctx->cxlr = cxled->cxld.region;
> > +
> If we have a match, little point in letting walk continue.
> 
> return 1;

Yes, thanks!  Returning 1 now to stop the walk.

> 
> Also, I "think" we just know that the association has been built.
> Injecting poison is probably still fine if the region / decoder hasn't yet
> been committed.

I think you are right. If we want to allow inject in the space between
mapping and commit, then this work needs to move to the region driver,
similar to how cxl_get_poison_by_endpoint() in the get poison list
series works.

I'm not seeing how injecting poison in that gap would be an important
debug scenario. Is it?

Alison

> 
> Jonathan
> 
> 
> > +	return 0;
> > +}
> > +
> > +static struct cxl_region *cxl_dpa_to_region(struct cxl_memdev *cxlmd, u64 dpa)
> > +{
> > +	struct cxl_dpa_to_region_context ctx;
> > +	struct cxl_port *port;
> > +
> > +	ctx = (struct cxl_dpa_to_region_context) {
> > +		.dpa = dpa,
> > +	};
> > +	port = dev_get_drvdata(&cxlmd->dev);
> > +	if (port && is_cxl_endpoint(port) && port->commit_end != -1)
> > +		device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region);
> > +
> > +	return ctx.cxlr;
> > +}
> > +
> >  static int cxl_validate_poison_dpa(struct cxl_memdev *cxlmd, u64 dpa)
> >  {
> >  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > @@ -242,6 +286,7 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
> >  	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> >  	struct cxl_mbox_inject_poison inject;
> >  	struct cxl_mbox_cmd mbox_cmd;
> > +	struct cxl_region *cxlr;
> >  	int rc;
> >  
> >  	if (!IS_ENABLED(CONFIG_DEBUG_FS))
> > @@ -261,6 +306,13 @@ int cxl_inject_poison(struct device *dev, u64 dpa)
> >  		.payload_in = &inject,
> >  	};
> >  	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
> > +	if (rc)
> > +		goto out;
> > +
> > +	cxlr = cxl_dpa_to_region(cxlmd, dpa);
> > +	if (cxlr)
> > +		dev_warn_once(dev, "poison inject dpa:0x%llx region: %s\n",
> > +			      dpa, dev_name(&cxlr->dev));
> >  out:
> >  	up_read(&cxl_dpa_rwsem);
> >  
> > @@ -273,6 +325,7 @@ int cxl_clear_poison(struct device *dev, u64 dpa)
> >  	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> >  	struct cxl_mbox_clear_poison clear;
> >  	struct cxl_mbox_cmd mbox_cmd;
> > +	struct cxl_region *cxlr;
> >  	int rc;
> >  
> >  	if (!IS_ENABLED(CONFIG_DEBUG_FS))
> > @@ -303,7 +356,13 @@ int cxl_clear_poison(struct device *dev, u64 dpa)
> >  	};
> >  
> >  	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
> > +	if (rc)
> > +		goto out;
> >  
> > +	cxlr = cxl_dpa_to_region(cxlmd, dpa);
> > +	if (cxlr)
> > +		dev_warn_once(dev, "poison clear dpa:0x%llx region: %s\n",
> > +			      dpa, dev_name(&cxlr->dev));
> >  out:
> >  	up_read(&cxl_dpa_rwsem);
> >  
>
Jonathan Cameron April 13, 2023, 5:07 p.m. UTC | #3
> > Also, I "think" we just know that the association has been built.
> > Injecting poison is probably still fine if the region / decoder hasn't yet
> > been committed.  
> 
> I think you are right. If we want to allow inject in the space between
> mapping and commit, then this work needs to move to the region driver,
> similar to how cxl_get_poison_by_endpoint() in the get poison list
> series works.
> 
> I'm not seeing how injecting poison in that gap, would be an important
> debug scenario. Is it?
> 

Probably not ;) Maybe a comment to say that this is being conservative by
preventing it earlier than strictly necessary.

If this merged whilst I wasn't paying attention no need to add one.
diff mbox series

Patch

diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 0e39c3c3fb09..a83619c31f61 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -213,6 +213,50 @@  ssize_t cxl_trigger_poison_list(struct device *dev,
 }
 EXPORT_SYMBOL_NS_GPL(cxl_trigger_poison_list, CXL);
 
+struct cxl_dpa_to_region_context {
+	struct cxl_region *cxlr;
+	u64 dpa;
+};
+
+static int __cxl_dpa_to_region(struct device *dev, void *arg)
+{
+	struct cxl_dpa_to_region_context *ctx = arg;
+	struct cxl_endpoint_decoder *cxled;
+	u64 dpa = ctx->dpa;
+
+	if (!is_endpoint_decoder(dev))
+		return 0;
+
+	cxled = to_cxl_endpoint_decoder(dev);
+	if (!cxled->dpa_res || !resource_size(cxled->dpa_res))
+		return 0;
+
+	if (dpa > cxled->dpa_res->end || dpa < cxled->dpa_res->start)
+		return 0;
+
+	dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa,
+		dev_name(&cxled->cxld.region->dev));
+
+	ctx->cxlr = cxled->cxld.region;
+
+	return 0;
+}
+
+static struct cxl_region *cxl_dpa_to_region(struct cxl_memdev *cxlmd, u64 dpa)
+{
+	struct cxl_dpa_to_region_context ctx;
+	struct cxl_port *port;
+
+	ctx = (struct cxl_dpa_to_region_context) {
+		.dpa = dpa,
+	};
+	port = dev_get_drvdata(&cxlmd->dev);
+	if (port && is_cxl_endpoint(port) && port->commit_end != -1)
+		device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region);
+
+	return ctx.cxlr;
+}
+
 static int cxl_validate_poison_dpa(struct cxl_memdev *cxlmd, u64 dpa)
 {
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
@@ -242,6 +286,7 @@  int cxl_inject_poison(struct device *dev, u64 dpa)
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
 	struct cxl_mbox_inject_poison inject;
 	struct cxl_mbox_cmd mbox_cmd;
+	struct cxl_region *cxlr;
 	int rc;
 
 	if (!IS_ENABLED(CONFIG_DEBUG_FS))
@@ -261,6 +306,13 @@  int cxl_inject_poison(struct device *dev, u64 dpa)
 		.payload_in = &inject,
 	};
 	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
+	if (rc)
+		goto out;
+
+	cxlr = cxl_dpa_to_region(cxlmd, dpa);
+	if (cxlr)
+		dev_warn_once(dev, "poison inject dpa:0x%llx region: %s\n",
+			      dpa, dev_name(&cxlr->dev));
 out:
 	up_read(&cxl_dpa_rwsem);
 
@@ -273,6 +325,7 @@  int cxl_clear_poison(struct device *dev, u64 dpa)
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
 	struct cxl_mbox_clear_poison clear;
 	struct cxl_mbox_cmd mbox_cmd;
+	struct cxl_region *cxlr;
 	int rc;
 
 	if (!IS_ENABLED(CONFIG_DEBUG_FS))
@@ -303,7 +356,13 @@  int cxl_clear_poison(struct device *dev, u64 dpa)
 	};
 
 	rc = cxl_internal_send_cmd(cxlmd->cxlds, &mbox_cmd);
+	if (rc)
+		goto out;
 
+	cxlr = cxl_dpa_to_region(cxlmd, dpa);
+	if (cxlr)
+		dev_warn_once(dev, "poison clear dpa:0x%llx region: %s\n",
+			      dpa, dev_name(&cxlr->dev));
 out:
 	up_read(&cxl_dpa_rwsem);