diff mbox series

[V10,8/9] cxl/port: Retry reading CDAT on failure

Message ID 20220605005049.2155874-9-ira.weiny@intel.com (mailing list archive)
State Superseded
Headers show
Series CXL: Read CDAT and DSMAS data | expand

Commit Message

Ira Weiny June 5, 2022, 12:50 a.m. UTC
From: Ira Weiny <ira.weiny@intel.com>

The CDAT read may fail for a number of reasons but mainly it is possible
to get different parts of a valid state.  The checksum in the CDAT table
protects against this.

Now that the CDAT data is validated, issue a retry if the CDAT read
fails.  For now, 5 retries are implemented.

Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

---
Changes from V9
	Alison Schofield/Davidlohr Bueso
		Print debug on each iteration and error only after failure

Changes from V8
	Move code to cxl/core/pci.c

Changes from V6
	Move to pci.c
	Fix retries count
	Change to 5 retries

Changes from V5:
	New patch -- easy to push off or drop.
---
 drivers/cxl/core/pci.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

Comments

Ben Widawsky June 6, 2022, 6:52 p.m. UTC | #1
On 22-06-04 17:50:48, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
> 
> The CDAT read may fail for a number of reasons but mainly it is possible
> to get different parts of a valid state.  The checksum in the CDAT table
> protects against this.
> 
> Now that the cdat data is validated issue a retries if the CDAT read
s/validated issue a retries/validated, issue a retry/
> fails.  For now 5 retries are implemented.
> 
> Cc: Alison Schofield <alison.schofield@intel.com>
> Cc: Davidlohr Bueso <dave@stgolabs.net>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> 
> ---
> Changes from V9
> 	Alison Schofield/Davidlohr Bueso
> 		Print debug on each iteration and error only after failure
> 
> Changes from V8
> 	Move code to cxl/core/pci.c
> 
> Changes from V6
> 	Move to pci.c
> 	Fix retries count
> 	Change to 5 retries
> 
> Changes from V5:
> 	New patch -- easy to push off or drop.
> ---
>  drivers/cxl/core/pci.c | 34 +++++++++++++++++++++++-----------
>  1 file changed, 23 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 73e28b82ffcf..e68f13e66fcf 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -631,20 +631,18 @@ static int cxl_cdat_read_table(struct cxl_port *port,
>  	return rc;
>  }
>  
> -void read_cdat_data(struct cxl_port *port)
> +static int __read_cdat_data(struct cxl_port *port)
>  {
>  	struct device *dev = &port->dev;
>  	size_t cdat_length;
>  	int ret;
>  
>  	if (cxl_cdat_get_length(port, &cdat_length))
> -		return;
> +		return 0;
>  
>  	port->cdat.table = devm_kzalloc(dev, cdat_length, GFP_KERNEL);
> -	if (!port->cdat.table) {
> -		ret = -ENOMEM;
> -		goto error;
> -	}
> +	if (!port->cdat.table)
> +		return -ENOMEM;
>  
>  	port->cdat.length = cdat_length;
>  	ret = cxl_cdat_read_table(port, &port->cdat);
> @@ -652,12 +650,26 @@ void read_cdat_data(struct cxl_port *port)
>  		devm_kfree(dev, port->cdat.table);
>  		port->cdat.table = NULL;
>  		port->cdat.length = 0;
> -		ret = -EIO;
> -		goto error;
> +		return -EIO;
>  	}
>  
> -	return;
> -error:
> -	dev_err(dev, "CDAT data read error (%d)\n", ret);
> +	return 0;
> +}
> +
> +void read_cdat_data(struct cxl_port *port)
> +{
> +	int retries = 5;
> +	int rc;
> +
> +	while (retries--) {
> +		rc = __read_cdat_data(port);
> +		if (!rc)
> +			return;
> +		dev_dbg(&port->dev,
> +			"CDAT data read error rc=%d (retries %d)\n",
> +			rc, retries);

Out of curiosity, what is the purpose of the dev_dbg? To diagnose delays or
something?

> +	}
> +	dev_err(&port->dev, "CDAT data read failed after %d retries\n",
> +		retries);


Reviewed-by: Ben Widawsky <bwidawsk@kernel.org>

>  }
>  EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
> -- 
> 2.35.1
>
Ira Weiny June 8, 2022, 11:07 p.m. UTC | #2
On Mon, Jun 06, 2022 at 11:52:03AM -0700, Ben Widawsky wrote:
> On 22-06-04 17:50:48, ira.weiny@intel.com wrote:
> > From: Ira Weiny <ira.weiny@intel.com>
> > 
> > The CDAT read may fail for a number of reasons but mainly it is possible
> > to get different parts of a valid state.  The checksum in the CDAT table
> > protects against this.
> > 
> > Now that the cdat data is validated issue a retries if the CDAT read
> s/validated issue a retries/validated, issue a retry/

Thanks!

> > fails.  For now 5 retries are implemented.
> > 
> > Cc: Alison Schofield <alison.schofield@intel.com>
> > Cc: Davidlohr Bueso <dave@stgolabs.net>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> > 
> > ---
> > Changes from V9
> > 	Alison Schofield/Davidlohr Bueso
> > 		Print debug on each iteration and error only after failure
> > 
> > Changes from V8
> > 	Move code to cxl/core/pci.c
> > 
> > Changes from V6
> > 	Move to pci.c
> > 	Fix retries count
> > 	Change to 5 retries
> > 
> > Changes from V5:
> > 	New patch -- easy to push off or drop.
> > ---
> >  drivers/cxl/core/pci.c | 34 +++++++++++++++++++++++-----------
> >  1 file changed, 23 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> > index 73e28b82ffcf..e68f13e66fcf 100644
> > --- a/drivers/cxl/core/pci.c
> > +++ b/drivers/cxl/core/pci.c
> > @@ -631,20 +631,18 @@ static int cxl_cdat_read_table(struct cxl_port *port,
> >  	return rc;
> >  }
> >  
> > -void read_cdat_data(struct cxl_port *port)
> > +static int __read_cdat_data(struct cxl_port *port)
> >  {
> >  	struct device *dev = &port->dev;
> >  	size_t cdat_length;
> >  	int ret;
> >  
> >  	if (cxl_cdat_get_length(port, &cdat_length))
> > -		return;
> > +		return 0;
> >  
> >  	port->cdat.table = devm_kzalloc(dev, cdat_length, GFP_KERNEL);
> > -	if (!port->cdat.table) {
> > -		ret = -ENOMEM;
> > -		goto error;
> > -	}
> > +	if (!port->cdat.table)
> > +		return -ENOMEM;
> >  
> >  	port->cdat.length = cdat_length;
> >  	ret = cxl_cdat_read_table(port, &port->cdat);
> > @@ -652,12 +650,26 @@ void read_cdat_data(struct cxl_port *port)
> >  		devm_kfree(dev, port->cdat.table);
> >  		port->cdat.table = NULL;
> >  		port->cdat.length = 0;
> > -		ret = -EIO;
> > -		goto error;
> > +		return -EIO;
> >  	}
> >  
> > -	return;
> > -error:
> > -	dev_err(dev, "CDAT data read error (%d)\n", ret);
> > +	return 0;
> > +}
> > +
> > +void read_cdat_data(struct cxl_port *port)
> > +{
> > +	int retries = 5;
> > +	int rc;
> > +
> > +	while (retries--) {
> > +		rc = __read_cdat_data(port);
> > +		if (!rc)
> > +			return;
> > +		dev_dbg(&port->dev,
> > +			"CDAT data read error rc=%d (retries %d)\n",
> > +			rc, retries);
> 
> > Out of curiosity, what is the purpose of the dev_dbg? To diagnose delays or
> > something?

Yes

> 
> > +	}
> > +	dev_err(&port->dev, "CDAT data read failed after %d retries\n",
> > +		retries);
> 
> 
> Reviewed-by: Ben Widawsky <bwidawsk@kernel.org>

Thanks!
Ira

> 
> >  }
> >  EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
> > -- 
> > 2.35.1
> >
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 73e28b82ffcf..e68f13e66fcf 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -631,20 +631,18 @@  static int cxl_cdat_read_table(struct cxl_port *port,
 	return rc;
 }
 
-void read_cdat_data(struct cxl_port *port)
+static int __read_cdat_data(struct cxl_port *port)
 {
 	struct device *dev = &port->dev;
 	size_t cdat_length;
 	int ret;
 
 	if (cxl_cdat_get_length(port, &cdat_length))
-		return;
+		return 0;
 
 	port->cdat.table = devm_kzalloc(dev, cdat_length, GFP_KERNEL);
-	if (!port->cdat.table) {
-		ret = -ENOMEM;
-		goto error;
-	}
+	if (!port->cdat.table)
+		return -ENOMEM;
 
 	port->cdat.length = cdat_length;
 	ret = cxl_cdat_read_table(port, &port->cdat);
@@ -652,12 +650,26 @@  void read_cdat_data(struct cxl_port *port)
 		devm_kfree(dev, port->cdat.table);
 		port->cdat.table = NULL;
 		port->cdat.length = 0;
-		ret = -EIO;
-		goto error;
+		return -EIO;
 	}
 
-	return;
-error:
-	dev_err(dev, "CDAT data read error (%d)\n", ret);
+	return 0;
+}
+
+void read_cdat_data(struct cxl_port *port)
+{
+	int retries = 5;
+	int rc;
+
+	while (retries--) {
+		rc = __read_cdat_data(port);
+		if (!rc)
+			return;
+		dev_dbg(&port->dev,
+			"CDAT data read error rc=%d (retries %d)\n",
+			rc, retries);
+	}
+	dev_err(&port->dev, "CDAT data read failed after %d retries\n",
+		retries);
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);