diff mbox

hwmon: pmbus: Make reg check and clear faults functions return errors

Message ID 20170905070132.17682-1-andrew@aj.id.au (mailing list archive)
State Rejected
Headers show

Commit Message

Andrew Jeffery Sept. 5, 2017, 7:01 a.m. UTC
Some functions exposed by pmbus core conflated errors that occurred when
setting the page to access with errors that occurred when accessing
registers in a page. In some cases, this caused legitimate errors to be
hidden under the guise of the register not being supported.

Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
---
 Documentation/hwmon/pmbus-core   |  12 ++--
 drivers/hwmon/pmbus/pmbus.h      |   6 +-
 drivers/hwmon/pmbus/pmbus_core.c | 115 +++++++++++++++++++++++++++++----------
 3 files changed, 95 insertions(+), 38 deletions(-)

Comments

Guenter Roeck Sept. 5, 2017, 5 p.m. UTC | #1
On Tue, Sep 05, 2017 at 05:01:32PM +1000, Andrew Jeffery wrote:
> Some functions exposed by pmbus core conflated errors that occurred when
> setting the page to access with errors that occurred when accessing
> registers in a page. In some cases, this caused legitimate errors to be
> hidden under the guise of the register not being supported.
> 

I'll have to look into this. If I recall correctly, the reason for not returning
errors from the "clear faults" command was that some early chips did not support
it, or did not support it on all pages. Those chips would now always fail.

On a higher level, I am not sure if it is a good idea to return an error
from a function intended to clear faults (and nothing else). That seems
counterproductive.

Is this a problem you have actually observed ? If so, what is the chip ?

Thanks,
Guenter

> Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
> ---
>  Documentation/hwmon/pmbus-core   |  12 ++--
>  drivers/hwmon/pmbus/pmbus.h      |   6 +-
>  drivers/hwmon/pmbus/pmbus_core.c | 115 +++++++++++++++++++++++++++++----------
>  3 files changed, 95 insertions(+), 38 deletions(-)
> 
> diff --git a/Documentation/hwmon/pmbus-core b/Documentation/hwmon/pmbus-core
> index 8ed10e9ddfb5..3e9f41bb756f 100644
> --- a/Documentation/hwmon/pmbus-core
> +++ b/Documentation/hwmon/pmbus-core
> @@ -218,17 +218,17 @@ Specifically, it provides the following information.
>    This function calls the device specific write_byte function if defined.
>    Therefore, it must _not_ be called from that function.
>  
> -  bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> +  int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
>  
> -  Check if byte register exists. Return true if the register exists, false
> -  otherwise.
> +  Check if byte register exists. Returns 1 if the register exists, 0 if it does
> +  not, and less than zero on an unexpected error.
>    This function calls the device specific write_byte function if defined to
>    obtain the chip status. Therefore, it must _not_ be called from that function.
>  
> -  bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> +  int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
>  
> -  Check if word register exists. Return true if the register exists, false
> -  otherwise.
> +  Check if word register exists. Returns 1 if the register exists, 0 if it does
> +  not, and less than zero on an unexpected error.
>    This function calls the device specific write_byte function if defined to
>    obtain the chip status. Therefore, it must _not_ be called from that function.
>  
> diff --git a/drivers/hwmon/pmbus/pmbus.h b/drivers/hwmon/pmbus/pmbus.h
> index bfcb13bae34b..c53032a04a6f 100644
> --- a/drivers/hwmon/pmbus/pmbus.h
> +++ b/drivers/hwmon/pmbus/pmbus.h
> @@ -413,9 +413,9 @@ int pmbus_write_byte_data(struct i2c_client *client, int page, u8 reg,
>  			  u8 value);
>  int pmbus_update_byte_data(struct i2c_client *client, int page, u8 reg,
>  			   u8 mask, u8 value);
> -void pmbus_clear_faults(struct i2c_client *client);
> -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> +int pmbus_clear_faults(struct i2c_client *client);
> +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> +int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
>  int pmbus_do_probe(struct i2c_client *client, const struct i2c_device_id *id,
>  		   struct pmbus_driver_info *info);
>  int pmbus_do_remove(struct i2c_client *client);
> diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
> index f1eff6b6c798..153700e35431 100644
> --- a/drivers/hwmon/pmbus/pmbus_core.c
> +++ b/drivers/hwmon/pmbus/pmbus_core.c
> @@ -304,18 +304,24 @@ static int _pmbus_read_byte_data(struct i2c_client *client, int page, int reg)
>  	return pmbus_read_byte_data(client, page, reg);
>  }
>  
> -static void pmbus_clear_fault_page(struct i2c_client *client, int page)
> +static int pmbus_clear_fault_page(struct i2c_client *client, int page)
>  {
> -	_pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
> +	return _pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
>  }
>  
> -void pmbus_clear_faults(struct i2c_client *client)
> +int pmbus_clear_faults(struct i2c_client *client)
>  {
>  	struct pmbus_data *data = i2c_get_clientdata(client);
> +	int rv;
>  	int i;
>  
> -	for (i = 0; i < data->info->pages; i++)
> -		pmbus_clear_fault_page(client, i);
> +	for (i = 0; i < data->info->pages; i++) {
> +		rv = pmbus_clear_fault_page(client, i);
> +		if (rv)
> +			return rv;
> +	}
> +
> +	return 0;
>  }
>  EXPORT_SYMBOL_GPL(pmbus_clear_faults);
>  
> @@ -333,28 +339,45 @@ static int pmbus_check_status_cml(struct i2c_client *client)
>  	return 0;
>  }
>  
> -static bool pmbus_check_register(struct i2c_client *client,
> +static int pmbus_check_register(struct i2c_client *client,
>  				 int (*func)(struct i2c_client *client,
>  					     int page, int reg),
>  				 int page, int reg)
>  {
> +	struct pmbus_data *data;
> +	int check;
>  	int rv;
> -	struct pmbus_data *data = i2c_get_clientdata(client);
>  
> -	rv = func(client, page, reg);
> -	if (rv >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
> -		rv = pmbus_check_status_cml(client);
> -	pmbus_clear_fault_page(client, -1);
> -	return rv >= 0;
> +	data = i2c_get_clientdata(client);
> +
> +	/*
> +	 * pmbus_set_page() guards transactions on the requested page matching
> +	 * the current page. This may be done in the execution of func(), but
> +	 * at that point a set-page error is conflated with accessing a
> +	 * non-existent register.
> +	 */
> +	rv = pmbus_set_page(client, page);
> +	if (rv < 0)
> +		return rv;
> +
> +	check = func(client, page, reg);
> +	if (check >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
> +		check = pmbus_check_status_cml(client);
> +
> +	rv = pmbus_clear_fault_page(client, -1);
> +	if (rv < 0)
> +		return rv;
> +
> +	return check >= 0;
>  }
>  
> -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
> +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
>  {
>  	return pmbus_check_register(client, _pmbus_read_byte_data, page, reg);
>  }
>  EXPORT_SYMBOL_GPL(pmbus_check_byte_register);
>  
> -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg)
> +int pmbus_check_word_register(struct i2c_client *client, int page, int reg)
>  {
>  	return pmbus_check_register(client, _pmbus_read_word_data, page, reg);
>  }
> @@ -390,7 +413,7 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
>  
>  	mutex_lock(&data->update_lock);
>  	if (time_after(jiffies, data->last_updated + HZ) || !data->valid) {
> -		int i, j;
> +		int i, j, ret;
>  
>  		for (i = 0; i < info->pages; i++) {
>  			data->status[PB_STATUS_BASE + i]
> @@ -424,7 +447,13 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
>  							    sensor->page,
>  							    sensor->reg);
>  		}
> -		pmbus_clear_faults(client);
> +
> +		ret = pmbus_clear_faults(client);
> +		if (ret < 0) {
> +			mutex_unlock(&data->update_lock);
> +			return ERR_PTR(ret);
> +		}
> +
>  		data->last_updated = jiffies;
>  		data->valid = 1;
>  	}
> @@ -754,6 +783,9 @@ static ssize_t pmbus_show_boolean(struct device *dev,
>  	struct pmbus_data *data = pmbus_update_device(dev);
>  	int val;
>  
> +	if (IS_ERR(data))
> +		return PTR_ERR(data);
> +
>  	val = pmbus_get_boolean(data, boolean, attr->index);
>  	if (val < 0)
>  		return val;
> @@ -766,6 +798,9 @@ static ssize_t pmbus_show_sensor(struct device *dev,
>  	struct pmbus_data *data = pmbus_update_device(dev);
>  	struct pmbus_sensor *sensor = to_pmbus_sensor(devattr);
>  
> +	if (IS_ERR(data))
> +		return PTR_ERR(data);
> +
>  	if (sensor->data < 0)
>  		return sensor->data;
>  
> @@ -995,7 +1030,11 @@ static int pmbus_add_limit_attrs(struct i2c_client *client,
>  	struct pmbus_sensor *curr;
>  
>  	for (i = 0; i < nlimit; i++) {
> -		if (pmbus_check_word_register(client, page, l->reg)) {
> +		ret = pmbus_check_word_register(client, page, l->reg);
> +		if (ret < 0)
> +			return ret;
> +
> +		if (ret) {
>  			curr = pmbus_add_sensor(data, name, l->attr, index,
>  						page, l->reg, attr->class,
>  						attr->update || l->update,
> @@ -1041,6 +1080,8 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
>  	if (!base)
>  		return -ENOMEM;
>  	if (attr->sfunc) {
> +		int check;
> +
>  		ret = pmbus_add_limit_attrs(client, data, info, name,
>  					    index, page, base, attr);
>  		if (ret < 0)
> @@ -1050,9 +1091,13 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
>  		 * alarm attributes, if there is a global alarm bit, and if
>  		 * the generic status register for this page is accessible.
>  		 */
> -		if (!ret && attr->gbit &&
> -		    pmbus_check_byte_register(client, page,
> -					      data->status_register)) {
> +
> +		check = pmbus_check_byte_register(client, page,
> +						  data->status_register);
> +		if (check < 0)
> +			return check;
> +
> +		if (!ret && attr->gbit && check) {
>  			ret = pmbus_add_boolean(data, name, "alarm", index,
>  						NULL, NULL,
>  						PB_STATUS_BASE + page,
> @@ -1604,8 +1649,12 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
>  			if (!(info->func[page] & pmbus_fan_flags[f]))
>  				break;
>  
> -			if (!pmbus_check_word_register(client, page,
> -						       pmbus_fan_registers[f]))
> +			ret = pmbus_check_word_register(client, page,
> +						       pmbus_fan_registers[f]);
> +			if (ret < 0)
> +				return ret;
> +
> +			if (!ret)
>  				break;
>  
>  			/*
> @@ -1628,9 +1677,13 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
>  			 * Each fan status register covers multiple fans,
>  			 * so we have to do some magic.
>  			 */
> -			if ((info->func[page] & pmbus_fan_status_flags[f]) &&
> -			    pmbus_check_byte_register(client,
> -					page, pmbus_fan_status_registers[f])) {
> +			ret =  pmbus_check_byte_register(client, page,
> +						pmbus_fan_status_registers[f]);
> +			if (ret < 0)
> +				return ret;
> +
> +			if ((info->func[page] & pmbus_fan_status_flags[f])
> +					&& ret) {
>  				int base;
>  
>  				if (f > 1)	/* fan 3, 4 */
> @@ -1696,10 +1749,13 @@ static int pmbus_identify_common(struct i2c_client *client,
>  				 struct pmbus_data *data, int page)
>  {
>  	int vout_mode = -1;
> +	int rv;
>  
> -	if (pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE))
> +	rv = pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE);
> +	if (rv == 1)
>  		vout_mode = _pmbus_read_byte_data(client, page,
>  						  PMBUS_VOUT_MODE);
> +
>  	if (vout_mode >= 0 && vout_mode != 0xff) {
>  		/*
>  		 * Not all chips support the VOUT_MODE command,
> @@ -1725,8 +1781,7 @@ static int pmbus_identify_common(struct i2c_client *client,
>  		}
>  	}
>  
> -	pmbus_clear_fault_page(client, page);
> -	return 0;
> +	return pmbus_clear_fault_page(client, page);
>  }
>  
>  static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
> @@ -1756,7 +1811,9 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
>  	if (ret >= 0 && (ret & PB_CAPABILITY_ERROR_CHECK))
>  		client->flags |= I2C_CLIENT_PEC;
>  
> -	pmbus_clear_faults(client);
> +	ret = pmbus_clear_faults(client);
> +	if (ret < 0)
> +		return ret;
>  
>  	if (info->identify) {
>  		ret = (*info->identify)(client, info);
> -- 
> 2.11.0
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-hwmon" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jeffery Sept. 6, 2017, 12:23 a.m. UTC | #2
On Tue, 2017-09-05 at 10:00 -0700, Guenter Roeck wrote:
> On Tue, Sep 05, 2017 at 05:01:32PM +1000, Andrew Jeffery wrote:
> > Some functions exposed by pmbus core conflated errors that occurred when
> > setting the page to access with errors that occurred when accessing
> > registers in a page. In some cases, this caused legitimate errors to be
> > hidden under the guise of the register not being supported.
> > 
> 
> I'll have to look into this. If I recall correctly, the reason for not returning
> errors from the "clear faults" command was that some early chips did not support
> it, or did not support it on all pages. Those chips would now always fail.

Yes, that is a concern.

However, shouldn't the lack of support for CLEAR_FAULTS be a described
property of the chip or page?

Regardless, the issue here is the PAGE command is sometimes failing
(more below). This means we won't have issued a CLEAR_FAULTS against
the page even if the page supports CLEAR_FAULTS. That to me is
something that should be propagated back up the call chain, as faults
that can be cleared will not have been.

> 
> On a higher level, I am not sure if it is a good idea to return an error
> from a function intended to clear faults (and nothing else). That seems
> counterproductive.
> 
> Is this a problem you have actually observed ? 

Unfortunately yes. I was trying to reproduce some issues that we are
seeing in userspace but wasn't having much luck (getting -EIO on
reading a hwmon attribute). I ended up instrumenting our I2C bus
driver, and found that the problem was more prevalent than what the
read failures in userspace were indicating. The paths that were
triggering these unreported conditions all traced through the check
register and clear fault functions, which on analysis were swallowing
the error.

> If so, what is the chip ?

Well, the chip that I'm *communicating* with is the Maxim MAX31785
Intelligent Fan Controller. As to whether that's what is *causing* the
PAGE command to fail is still up in the air. I would not be surprised
if we have other issues in the hardware design.

I haven't sent a follow-up to the series introducing the MAX31785
driver because I've been chasing down communication issues. I felt it
was important to understand whether the device has quirks that need to
be worked around in the driver, or if our hardware design has bigger
problems that should be handled in other ways. I've been in touch with
Maxim who have asserted that some of the problems we're seeing cannot
be caused by their chip.

Andrew

> 
> Thanks,
> Guenter
> 
> > > > Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
> > ---
> >  Documentation/hwmon/pmbus-core   |  12 ++--
> >  drivers/hwmon/pmbus/pmbus.h      |   6 +-
> >  drivers/hwmon/pmbus/pmbus_core.c | 115 +++++++++++++++++++++++++++++----------
> >  3 files changed, 95 insertions(+), 38 deletions(-)
> > 
> > diff --git a/Documentation/hwmon/pmbus-core b/Documentation/hwmon/pmbus-core
> > index 8ed10e9ddfb5..3e9f41bb756f 100644
> > --- a/Documentation/hwmon/pmbus-core
> > +++ b/Documentation/hwmon/pmbus-core
> > @@ -218,17 +218,17 @@ Specifically, it provides the following information.
> >    This function calls the device specific write_byte function if defined.
> >    Therefore, it must _not_ be called from that function.
> >  
> > -  bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > +  int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> >  
> > -  Check if byte register exists. Return true if the register exists, false
> > -  otherwise.
> > +  Check if byte register exists. Returns 1 if the register exists, 0 if it does
> > +  not, and less than zero on an unexpected error.
> >    This function calls the device specific write_byte function if defined to
> >    obtain the chip status. Therefore, it must _not_ be called from that function.
> >  
> > -  bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > +  int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> >  
> > -  Check if word register exists. Return true if the register exists, false
> > -  otherwise.
> > +  Check if word register exists. Returns 1 if the register exists, 0 if it does
> > +  not, and less than zero on an unexpected error.
> >    This function calls the device specific write_byte function if defined to
> >    obtain the chip status. Therefore, it must _not_ be called from that function.
> >  
> > diff --git a/drivers/hwmon/pmbus/pmbus.h b/drivers/hwmon/pmbus/pmbus.h
> > index bfcb13bae34b..c53032a04a6f 100644
> > --- a/drivers/hwmon/pmbus/pmbus.h
> > +++ b/drivers/hwmon/pmbus/pmbus.h
> > @@ -413,9 +413,9 @@ int pmbus_write_byte_data(struct i2c_client *client, int page, u8 reg,
> > > >  			  u8 value);
> >  int pmbus_update_byte_data(struct i2c_client *client, int page, u8 reg,
> > > >  			   u8 mask, u8 value);
> > -void pmbus_clear_faults(struct i2c_client *client);
> > -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > +int pmbus_clear_faults(struct i2c_client *client);
> > +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > +int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> >  int pmbus_do_probe(struct i2c_client *client, const struct i2c_device_id *id,
> > > >  		   struct pmbus_driver_info *info);
> >  int pmbus_do_remove(struct i2c_client *client);
> > diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
> > index f1eff6b6c798..153700e35431 100644
> > --- a/drivers/hwmon/pmbus/pmbus_core.c
> > +++ b/drivers/hwmon/pmbus/pmbus_core.c
> > @@ -304,18 +304,24 @@ static int _pmbus_read_byte_data(struct i2c_client *client, int page, int reg)
> > > >  	return pmbus_read_byte_data(client, page, reg);
> >  }
> >  
> > -static void pmbus_clear_fault_page(struct i2c_client *client, int page)
> > +static int pmbus_clear_fault_page(struct i2c_client *client, int page)
> >  {
> > > > -	_pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
> > > > +	return _pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
> >  }
> >  
> > -void pmbus_clear_faults(struct i2c_client *client)
> > +int pmbus_clear_faults(struct i2c_client *client)
> >  {
> > > >  	struct pmbus_data *data = i2c_get_clientdata(client);
> > > > +	int rv;
> > > >  	int i;
> >  
> > > > -	for (i = 0; i < data->info->pages; i++)
> > > > -		pmbus_clear_fault_page(client, i);
> > > > +	for (i = 0; i < data->info->pages; i++) {
> > > > +		rv = pmbus_clear_fault_page(client, i);
> > > > +		if (rv)
> > > > +			return rv;
> > > > +	}
> > +
> > > > +	return 0;
> >  }
> >  EXPORT_SYMBOL_GPL(pmbus_clear_faults);
> >  
> > @@ -333,28 +339,45 @@ static int pmbus_check_status_cml(struct i2c_client *client)
> > > >  	return 0;
> >  }
> >  
> > -static bool pmbus_check_register(struct i2c_client *client,
> > +static int pmbus_check_register(struct i2c_client *client,
> > > >  				 int (*func)(struct i2c_client *client,
> > > >  					     int page, int reg),
> > > >  				 int page, int reg)
> >  {
> > > > +	struct pmbus_data *data;
> > > > +	int check;
> > > >  	int rv;
> > > > -	struct pmbus_data *data = i2c_get_clientdata(client);
> >  
> > > > -	rv = func(client, page, reg);
> > > > -	if (rv >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
> > > > -		rv = pmbus_check_status_cml(client);
> > > > -	pmbus_clear_fault_page(client, -1);
> > > > -	return rv >= 0;
> > > > +	data = i2c_get_clientdata(client);
> > +
> > > > +	/*
> > > > +	 * pmbus_set_page() guards transactions on the requested page matching
> > > > +	 * the current page. This may be done in the execution of func(), but
> > > > +	 * at that point a set-page error is conflated with accessing a
> > > > +	 * non-existent register.
> > > > +	 */
> > > > +	rv = pmbus_set_page(client, page);
> > > > +	if (rv < 0)
> > > > +		return rv;
> > +
> > > > +	check = func(client, page, reg);
> > > > +	if (check >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
> > > > +		check = pmbus_check_status_cml(client);
> > +
> > > > +	rv = pmbus_clear_fault_page(client, -1);
> > > > +	if (rv < 0)
> > > > +		return rv;
> > +
> > > > +	return check >= 0;
> >  }
> >  
> > -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
> > +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
> >  {
> > > >  	return pmbus_check_register(client, _pmbus_read_byte_data, page, reg);
> >  }
> >  EXPORT_SYMBOL_GPL(pmbus_check_byte_register);
> >  
> > -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg)
> > +int pmbus_check_word_register(struct i2c_client *client, int page, int reg)
> >  {
> > > >  	return pmbus_check_register(client, _pmbus_read_word_data, page, reg);
> >  }
> > @@ -390,7 +413,7 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
> >  
> > > >  	mutex_lock(&data->update_lock);
> > > >  	if (time_after(jiffies, data->last_updated + HZ) || !data->valid) {
> > > > -		int i, j;
> > > > +		int i, j, ret;
> >  
> > > >  		for (i = 0; i < info->pages; i++) {
> > > >  			data->status[PB_STATUS_BASE + i]
> > @@ -424,7 +447,13 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
> > > >  							    sensor->page,
> > > >  							    sensor->reg);
> > > >  		}
> > > > -		pmbus_clear_faults(client);
> > +
> > > > +		ret = pmbus_clear_faults(client);
> > > > +		if (ret < 0) {
> > > > +			mutex_unlock(&data->update_lock);
> > > > +			return ERR_PTR(ret);
> > > > +		}
> > +
> > > >  		data->last_updated = jiffies;
> > > >  		data->valid = 1;
> > > >  	}
> > @@ -754,6 +783,9 @@ static ssize_t pmbus_show_boolean(struct device *dev,
> > > >  	struct pmbus_data *data = pmbus_update_device(dev);
> > > >  	int val;
> >  
> > > > +	if (IS_ERR(data))
> > > > +		return PTR_ERR(data);
> > +
> > > >  	val = pmbus_get_boolean(data, boolean, attr->index);
> > > >  	if (val < 0)
> > > >  		return val;
> > @@ -766,6 +798,9 @@ static ssize_t pmbus_show_sensor(struct device *dev,
> > > >  	struct pmbus_data *data = pmbus_update_device(dev);
> > > >  	struct pmbus_sensor *sensor = to_pmbus_sensor(devattr);
> >  
> > > > +	if (IS_ERR(data))
> > > > +		return PTR_ERR(data);
> > +
> > > >  	if (sensor->data < 0)
> > > >  		return sensor->data;
> >  
> > @@ -995,7 +1030,11 @@ static int pmbus_add_limit_attrs(struct i2c_client *client,
> > > >  	struct pmbus_sensor *curr;
> >  
> > > >  	for (i = 0; i < nlimit; i++) {
> > > > -		if (pmbus_check_word_register(client, page, l->reg)) {
> > > > +		ret = pmbus_check_word_register(client, page, l->reg);
> > > > +		if (ret < 0)
> > > > +			return ret;
> > +
> > > > +		if (ret) {
> > > >  			curr = pmbus_add_sensor(data, name, l->attr, index,
> > > >  						page, l->reg, attr->class,
> > > >  						attr->update || l->update,
> > @@ -1041,6 +1080,8 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
> > > >  	if (!base)
> > > >  		return -ENOMEM;
> > > >  	if (attr->sfunc) {
> > > > +		int check;
> > +
> > > >  		ret = pmbus_add_limit_attrs(client, data, info, name,
> > > >  					    index, page, base, attr);
> > > >  		if (ret < 0)
> > @@ -1050,9 +1091,13 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
> > > >  		 * alarm attributes, if there is a global alarm bit, and if
> > > >  		 * the generic status register for this page is accessible.
> > > >  		 */
> > > > -		if (!ret && attr->gbit &&
> > > > -		    pmbus_check_byte_register(client, page,
> > > > -					      data->status_register)) {
> > +
> > > > +		check = pmbus_check_byte_register(client, page,
> > > > +						  data->status_register);
> > > > +		if (check < 0)
> > > > +			return check;
> > +
> > > > +		if (!ret && attr->gbit && check) {
> > > >  			ret = pmbus_add_boolean(data, name, "alarm", index,
> > > >  						NULL, NULL,
> > > >  						PB_STATUS_BASE + page,
> > @@ -1604,8 +1649,12 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
> > > >  			if (!(info->func[page] & pmbus_fan_flags[f]))
> > > >  				break;
> >  
> > > > -			if (!pmbus_check_word_register(client, page,
> > > > -						       pmbus_fan_registers[f]))
> > > > +			ret = pmbus_check_word_register(client, page,
> > > > +						       pmbus_fan_registers[f]);
> > > > +			if (ret < 0)
> > > > +				return ret;
> > +
> > > > +			if (!ret)
> > > >  				break;
> >  
> > > >  			/*
> > @@ -1628,9 +1677,13 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
> > > >  			 * Each fan status register covers multiple fans,
> > > >  			 * so we have to do some magic.
> > > >  			 */
> > > > -			if ((info->func[page] & pmbus_fan_status_flags[f]) &&
> > > > -			    pmbus_check_byte_register(client,
> > > > -					page, pmbus_fan_status_registers[f])) {
> > > > +			ret =  pmbus_check_byte_register(client, page,
> > > > +						pmbus_fan_status_registers[f]);
> > > > +			if (ret < 0)
> > > > +				return ret;
> > +
> > > > +			if ((info->func[page] & pmbus_fan_status_flags[f])
> > > > +					&& ret) {
> > > >  				int base;
> >  
> > > > > >  				if (f > 1)	/* fan 3, 4 */
> > @@ -1696,10 +1749,13 @@ static int pmbus_identify_common(struct i2c_client *client,
> > > >  				 struct pmbus_data *data, int page)
> >  {
> > > >  	int vout_mode = -1;
> > > > +	int rv;
> >  
> > > > -	if (pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE))
> > > > +	rv = pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE);
> > > > +	if (rv == 1)
> > > >  		vout_mode = _pmbus_read_byte_data(client, page,
> > > >  						  PMBUS_VOUT_MODE);
> > +
> > > >  	if (vout_mode >= 0 && vout_mode != 0xff) {
> > > >  		/*
> > > >  		 * Not all chips support the VOUT_MODE command,
> > @@ -1725,8 +1781,7 @@ static int pmbus_identify_common(struct i2c_client *client,
> > > >  		}
> > > >  	}
> >  
> > > > -	pmbus_clear_fault_page(client, page);
> > > > -	return 0;
> > > > +	return pmbus_clear_fault_page(client, page);
> >  }
> >  
> >  static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
> > @@ -1756,7 +1811,9 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
> > > >  	if (ret >= 0 && (ret & PB_CAPABILITY_ERROR_CHECK))
> > > >  		client->flags |= I2C_CLIENT_PEC;
> >  
> > > > -	pmbus_clear_faults(client);
> > > > +	ret = pmbus_clear_faults(client);
> > > > +	if (ret < 0)
> > > > +		return ret;
> >  
> > > >  	if (info->identify) {
> > > >  		ret = (*info->identify)(client, info);
> > -- 
> > 2.11.0
> >
Guenter Roeck Sept. 6, 2017, 10:51 p.m. UTC | #3
On Wed, Sep 06, 2017 at 10:23:37AM +1000, Andrew Jeffery wrote:
> On Tue, 2017-09-05 at 10:00 -0700, Guenter Roeck wrote:
> > On Tue, Sep 05, 2017 at 05:01:32PM +1000, Andrew Jeffery wrote:
> > > Some functions exposed by pmbus core conflated errors that occurred when
> > > setting the page to access with errors that occurred when accessing
> > > registers in a page. In some cases, this caused legitimate errors to be
> > > hidden under the guise of the register not being supported.
> > > 
> > 
> > I'll have to look into this. If I recall correctly, the reason for not returning
> > errors from the "clear faults" command was that some early chips did not support
> > it, or did not support it on all pages. Those chips would now always fail.
> 
> Yes, that is a concern.
> 
> However, shouldn't the lack of support for CLEAR_FAULTS be a described
> property of the chip or page?
> 
> Regardless, the issue here is the PAGE command is sometimes failing
> (more below). This means we won't have issued a CLEAR_FAULTS against
> the page even if the page supports CLEAR_FAULTS. That to me is
> something that should be propagated back up the call chain, as faults
> that can be cleared will not have been.
> 
We could also consider
- retry
- Add a marker indicating that faults (specifically command errors)
  are unreliable after clearing faults failed

If we can't reliably clear faults, all bets are off.

> > 
> > On a higher level, I am not sure if it is a good idea to return an error
> > from a function intended to clear faults (and nothing else). That seems
> > counterproductive.
> > 
> > Is this a problem you have actually observed ? 
> 
> Unfortunately yes. I was trying to reproduce some issues that we are
> seeing in userspace but wasn't having much luck (getting -EIO on
> reading a hwmon attribute). I ended up instrumenting our I2C bus
> driver, and found that the problem was more prevalent than what the
> read failures in userspace were indicating. The paths that were
> triggering these unreported conditions all traced through the check
> register and clear fault functions, which on analysis were swallowing
> the error.
> 
> > If so, what is the chip ?
> 
> Well, the chip that I'm *communicating* with is the Maxim MAX31785
> Intelligent Fan Controller. As to whether that's what is *causing* the
> PAGE command to fail is still up in the air. I would not be surprised
> if we have other issues in the hardware design.
> 
> I haven't sent a follow-up to the series introducing the MAX31785
> driver because I've been chasing down communication issues. I felt it
> was important to understand whether the device has quirks that need to
> be worked around in the driver, or if our hardware design has bigger
> problems that should be handled in other ways. I've been in touch with
> Maxim who have asserted that some of the problems we're seeing cannot
> be caused by their chip.
> 

Guess I need to dig up my eval board and see if I can reproduce the problem.
Seems you are saying that the problem is always seen when issuing a sequence
of "clear faults" commands on multiple pages ?

The MAX31785 is microcode based, so I would not be entirely surprised if
it sometimes fails to reply if a sequence of <set page, clear faults>
commands is executed.

If possible, you might try reducing the i2c clock. I have not seen that with
Maxim chips, but some other PMBus chips don't operate reliably at 400 KHz.

Guenter

> > 
> > > > > Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
> > > ---
> > >  Documentation/hwmon/pmbus-core   |  12 ++--
> > >  drivers/hwmon/pmbus/pmbus.h      |   6 +-
> > >  drivers/hwmon/pmbus/pmbus_core.c | 115 +++++++++++++++++++++++++++++----------
> > >  3 files changed, 95 insertions(+), 38 deletions(-)
> > > 
> > > diff --git a/Documentation/hwmon/pmbus-core b/Documentation/hwmon/pmbus-core
> > > index 8ed10e9ddfb5..3e9f41bb756f 100644
> > > --- a/Documentation/hwmon/pmbus-core
> > > +++ b/Documentation/hwmon/pmbus-core
> > > @@ -218,17 +218,17 @@ Specifically, it provides the following information.
> > >    This function calls the device specific write_byte function if defined.
> > >    Therefore, it must _not_ be called from that function.
> > >  
> > > -  bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > > +  int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > >  
> > > -  Check if byte register exists. Return true if the register exists, false
> > > -  otherwise.
> > > +  Check if byte register exists. Returns 1 if the register exists, 0 if it does
> > > +  not, and less than zero on an unexpected error.
> > >    This function calls the device specific write_byte function if defined to
> > >    obtain the chip status. Therefore, it must _not_ be called from that function.
> > >  
> > > -  bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > > +  int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > >  
> > > -  Check if word register exists. Return true if the register exists, false
> > > -  otherwise.
> > > +  Check if word register exists. Returns 1 if the register exists, 0 if it does
> > > +  not, and less than zero on an unexpected error.
> > >    This function calls the device specific write_byte function if defined to
> > >    obtain the chip status. Therefore, it must _not_ be called from that function.
> > >  
> > > diff --git a/drivers/hwmon/pmbus/pmbus.h b/drivers/hwmon/pmbus/pmbus.h
> > > index bfcb13bae34b..c53032a04a6f 100644
> > > --- a/drivers/hwmon/pmbus/pmbus.h
> > > +++ b/drivers/hwmon/pmbus/pmbus.h
> > > @@ -413,9 +413,9 @@ int pmbus_write_byte_data(struct i2c_client *client, int page, u8 reg,
> > > > >  			  u8 value);
> > >  int pmbus_update_byte_data(struct i2c_client *client, int page, u8 reg,
> > > > >  			   u8 mask, u8 value);
> > > -void pmbus_clear_faults(struct i2c_client *client);
> > > -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > > -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > > +int pmbus_clear_faults(struct i2c_client *client);
> > > +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > > +int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > >  int pmbus_do_probe(struct i2c_client *client, const struct i2c_device_id *id,
> > > > >  		   struct pmbus_driver_info *info);
> > >  int pmbus_do_remove(struct i2c_client *client);
> > > diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
> > > index f1eff6b6c798..153700e35431 100644
> > > --- a/drivers/hwmon/pmbus/pmbus_core.c
> > > +++ b/drivers/hwmon/pmbus/pmbus_core.c
> > > @@ -304,18 +304,24 @@ static int _pmbus_read_byte_data(struct i2c_client *client, int page, int reg)
> > > > >  	return pmbus_read_byte_data(client, page, reg);
> > >  }
> > >  
> > > -static void pmbus_clear_fault_page(struct i2c_client *client, int page)
> > > +static int pmbus_clear_fault_page(struct i2c_client *client, int page)
> > >  {
> > > > > -	_pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
> > > > > +	return _pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
> > >  }
> > >  
> > > -void pmbus_clear_faults(struct i2c_client *client)
> > > +int pmbus_clear_faults(struct i2c_client *client)
> > >  {
> > > > >  	struct pmbus_data *data = i2c_get_clientdata(client);
> > > > > +	int rv;
> > > > >  	int i;
> > >  
> > > > > -	for (i = 0; i < data->info->pages; i++)
> > > > > -		pmbus_clear_fault_page(client, i);
> > > > > +	for (i = 0; i < data->info->pages; i++) {
> > > > > +		rv = pmbus_clear_fault_page(client, i);
> > > > > +		if (rv)
> > > > > +			return rv;
> > > > > +	}
> > > +
> > > > > +	return 0;
> > >  }
> > >  EXPORT_SYMBOL_GPL(pmbus_clear_faults);
> > >  
> > > @@ -333,28 +339,45 @@ static int pmbus_check_status_cml(struct i2c_client *client)
> > > > >  	return 0;
> > >  }
> > >  
> > > -static bool pmbus_check_register(struct i2c_client *client,
> > > +static int pmbus_check_register(struct i2c_client *client,
> > > > >  				 int (*func)(struct i2c_client *client,
> > > > >  					     int page, int reg),
> > > > >  				 int page, int reg)
> > >  {
> > > > > +	struct pmbus_data *data;
> > > > > +	int check;
> > > > >  	int rv;
> > > > > -	struct pmbus_data *data = i2c_get_clientdata(client);
> > >  
> > > > > -	rv = func(client, page, reg);
> > > > > -	if (rv >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
> > > > > -		rv = pmbus_check_status_cml(client);
> > > > > -	pmbus_clear_fault_page(client, -1);
> > > > > -	return rv >= 0;
> > > > > +	data = i2c_get_clientdata(client);
> > > +
> > > > > +	/*
> > > > > +	 * pmbus_set_page() guards transactions on the requested page matching
> > > > > +	 * the current page. This may be done in the execution of func(), but
> > > > > +	 * at that point a set-page error is conflated with accessing a
> > > > > +	 * non-existent register.
> > > > > +	 */
> > > > > +	rv = pmbus_set_page(client, page);
> > > > > +	if (rv < 0)
> > > > > +		return rv;
> > > +
> > > > > +	check = func(client, page, reg);
> > > > > +	if (check >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
> > > > > +		check = pmbus_check_status_cml(client);
> > > +
> > > > > +	rv = pmbus_clear_fault_page(client, -1);
> > > > > +	if (rv < 0)
> > > > > +		return rv;
> > > +
> > > > > +	return check >= 0;
> > >  }
> > >  
> > > -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
> > > +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
> > >  {
> > > > >  	return pmbus_check_register(client, _pmbus_read_byte_data, page, reg);
> > >  }
> > >  EXPORT_SYMBOL_GPL(pmbus_check_byte_register);
> > >  
> > > -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg)
> > > +int pmbus_check_word_register(struct i2c_client *client, int page, int reg)
> > >  {
> > > > >  	return pmbus_check_register(client, _pmbus_read_word_data, page, reg);
> > >  }
> > > @@ -390,7 +413,7 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
> > >  
> > > > >  	mutex_lock(&data->update_lock);
> > > > >  	if (time_after(jiffies, data->last_updated + HZ) || !data->valid) {
> > > > > -		int i, j;
> > > > > +		int i, j, ret;
> > >  
> > > > >  		for (i = 0; i < info->pages; i++) {
> > > > >  			data->status[PB_STATUS_BASE + i]
> > > @@ -424,7 +447,13 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
> > > > >  							    sensor->page,
> > > > >  							    sensor->reg);
> > > > >  		}
> > > > > -		pmbus_clear_faults(client);
> > > +
> > > > > +		ret = pmbus_clear_faults(client);
> > > > > +		if (ret < 0) {
> > > > > +			mutex_unlock(&data->update_lock);
> > > > > +			return ERR_PTR(ret);
> > > > > +		}
> > > +
> > > > >  		data->last_updated = jiffies;
> > > > >  		data->valid = 1;
> > > > >  	}
> > > @@ -754,6 +783,9 @@ static ssize_t pmbus_show_boolean(struct device *dev,
> > > > >  	struct pmbus_data *data = pmbus_update_device(dev);
> > > > >  	int val;
> > >  
> > > > > +	if (IS_ERR(data))
> > > > > +		return PTR_ERR(data);
> > > +
> > > > >  	val = pmbus_get_boolean(data, boolean, attr->index);
> > > > >  	if (val < 0)
> > > > >  		return val;
> > > @@ -766,6 +798,9 @@ static ssize_t pmbus_show_sensor(struct device *dev,
> > > > >  	struct pmbus_data *data = pmbus_update_device(dev);
> > > > >  	struct pmbus_sensor *sensor = to_pmbus_sensor(devattr);
> > >  
> > > > > +	if (IS_ERR(data))
> > > > > +		return PTR_ERR(data);
> > > +
> > > > >  	if (sensor->data < 0)
> > > > >  		return sensor->data;
> > >  
> > > @@ -995,7 +1030,11 @@ static int pmbus_add_limit_attrs(struct i2c_client *client,
> > > > >  	struct pmbus_sensor *curr;
> > >  
> > > > >  	for (i = 0; i < nlimit; i++) {
> > > > > -		if (pmbus_check_word_register(client, page, l->reg)) {
> > > > > +		ret = pmbus_check_word_register(client, page, l->reg);
> > > > > +		if (ret < 0)
> > > > > +			return ret;
> > > +
> > > > > +		if (ret) {
> > > > >  			curr = pmbus_add_sensor(data, name, l->attr, index,
> > > > >  						page, l->reg, attr->class,
> > > > >  						attr->update || l->update,
> > > @@ -1041,6 +1080,8 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
> > > > >  	if (!base)
> > > > >  		return -ENOMEM;
> > > > >  	if (attr->sfunc) {
> > > > > +		int check;
> > > +
> > > > >  		ret = pmbus_add_limit_attrs(client, data, info, name,
> > > > >  					    index, page, base, attr);
> > > > >  		if (ret < 0)
> > > @@ -1050,9 +1091,13 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
> > > > >  		 * alarm attributes, if there is a global alarm bit, and if
> > > > >  		 * the generic status register for this page is accessible.
> > > > >  		 */
> > > > > -		if (!ret && attr->gbit &&
> > > > > -		    pmbus_check_byte_register(client, page,
> > > > > -					      data->status_register)) {
> > > +
> > > > > +		check = pmbus_check_byte_register(client, page,
> > > > > +						  data->status_register);
> > > > > +		if (check < 0)
> > > > > +			return check;
> > > +
> > > > > +		if (!ret && attr->gbit && check) {
> > > > >  			ret = pmbus_add_boolean(data, name, "alarm", index,
> > > > >  						NULL, NULL,
> > > > >  						PB_STATUS_BASE + page,
> > > @@ -1604,8 +1649,12 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
> > > > >  			if (!(info->func[page] & pmbus_fan_flags[f]))
> > > > >  				break;
> > >  
> > > > > -			if (!pmbus_check_word_register(client, page,
> > > > > -						       pmbus_fan_registers[f]))
> > > > > +			ret = pmbus_check_word_register(client, page,
> > > > > +						       pmbus_fan_registers[f]);
> > > > > +			if (ret < 0)
> > > > > +				return ret;
> > > +
> > > > > +			if (!ret)
> > > > >  				break;
> > >  
> > > > >  			/*
> > > @@ -1628,9 +1677,13 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
> > > > >  			 * Each fan status register covers multiple fans,
> > > > >  			 * so we have to do some magic.
> > > > >  			 */
> > > > > -			if ((info->func[page] & pmbus_fan_status_flags[f]) &&
> > > > > -			    pmbus_check_byte_register(client,
> > > > > -					page, pmbus_fan_status_registers[f])) {
> > > > > +			ret =  pmbus_check_byte_register(client, page,
> > > > > +						pmbus_fan_status_registers[f]);
> > > > > +			if (ret < 0)
> > > > > +				return ret;
> > > +
> > > > > +			if ((info->func[page] & pmbus_fan_status_flags[f])
> > > > > +					&& ret) {
> > > > >  				int base;
> > >  
> > > > > > >  				if (f > 1)	/* fan 3, 4 */
> > > @@ -1696,10 +1749,13 @@ static int pmbus_identify_common(struct i2c_client *client,
> > > > >  				 struct pmbus_data *data, int page)
> > >  {
> > > > >  	int vout_mode = -1;
> > > > > +	int rv;
> > >  
> > > > > -	if (pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE))
> > > > > +	rv = pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE);
> > > > > +	if (rv == 1)
> > > > >  		vout_mode = _pmbus_read_byte_data(client, page,
> > > > >  						  PMBUS_VOUT_MODE);
> > > +
> > > > >  	if (vout_mode >= 0 && vout_mode != 0xff) {
> > > > >  		/*
> > > > >  		 * Not all chips support the VOUT_MODE command,
> > > @@ -1725,8 +1781,7 @@ static int pmbus_identify_common(struct i2c_client *client,
> > > > >  		}
> > > > >  	}
> > >  
> > > > > -	pmbus_clear_fault_page(client, page);
> > > > > -	return 0;
> > > > > +	return pmbus_clear_fault_page(client, page);
> > >  }
> > >  
> > >  static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
> > > @@ -1756,7 +1811,9 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
> > > > >  	if (ret >= 0 && (ret & PB_CAPABILITY_ERROR_CHECK))
> > > > >  		client->flags |= I2C_CLIENT_PEC;
> > >  
> > > > > -	pmbus_clear_faults(client);
> > > > > +	ret = pmbus_clear_faults(client);
> > > > > +	if (ret < 0)
> > > > > +		return ret;
> > >  
> > > > >  	if (info->identify) {
> > > > >  		ret = (*info->identify)(client, info);
> > > -- 
> > > 2.11.0
> > > 


--
To unsubscribe from this list: send the line "unsubscribe linux-hwmon" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jeffery Sept. 6, 2017, 11:32 p.m. UTC | #4
On Wed, 2017-09-06 at 15:51 -0700, Guenter Roeck wrote:
> On Wed, Sep 06, 2017 at 10:23:37AM +1000, Andrew Jeffery wrote:
> > On Tue, 2017-09-05 at 10:00 -0700, Guenter Roeck wrote:
> > > On Tue, Sep 05, 2017 at 05:01:32PM +1000, Andrew Jeffery wrote:
> > > > Some functions exposed by pmbus core conflated errors that occurred when
> > > > setting the page to access with errors that occurred when accessing
> > > > registers in a page. In some cases, this caused legitimate errors to be
> > > > hidden under the guise of the register not being supported.
> > > > 
> > > 
> > > I'll have to look into this. If I recall correctly, the reason for not returning
> > > errors from the "clear faults" command was that some early chips did not support
> > > it, or did not support it on all pages. Those chips would now always fail.
> > 
> > Yes, that is a concern.
> > 
> > However, shouldn't the lack of support for CLEAR_FAULTS be a described
> > property of the chip or page?
> > 
> > Regardless, the issue here is the PAGE command is sometimes failing
> > (more below). This means we won't have issued a CLEAR_FAULTS against
> > the page even if the page supports CLEAR_FAULTS. That to me is
> > something that should be propagated back up the call chain, as faults
> > that can be cleared will not have been.
> > 

> We could also consider
> - retry

I guess that leads to the usual retries problem: How many? Oneshot? More?

My expectation is that oneshot would be enough, but we'd still need to consider
what to do if that wasn't successful.

Does the retry policy need to be in the kernel? I guess if it's not we would
need all operations in the path to the failure to be idempotent.

> - Add a marker indicating that faults (specifically command errors)
>   are unreliable after clearing faults failed

What kind of marker were you thinking? Something in dmesg? If it's anything
else we'd probably want something to respond to the condition, which nothing
would do currently.

> If we can't reliably clear faults, all bets are off.

Yeah, it's a messy problem. However even if we resolve our issues in hardware
(i.e. it's discovered that it is a design failure or something), I don't think
we should drop a resolution to the problem highlighted by this patch.

> > > 
> > > On a higher level, I am not sure if it is a good idea to return an error
> > > from a function intended to clear faults (and nothing else). That seems
> > > counterproductive.
> > > 
> > > Is this a problem you have actually observed ? 
> > 
> > Unfortunately yes. I was trying to reproduce some issues that we are
> > seeing in userspace but wasn't having much luck (getting -EIO on
> > reading a hwmon attribute). I ended up instrumenting our I2C bus
> > driver, and found that the problem was more prevalent than what the
> > read failures in userspace were indicating. The paths that were
> > triggering these unreported conditions all traced through the check
> > register and clear fault functions, which on analysis were swallowing
> > the error.
> > 
> > > If so, what is the chip ?
> > 
> > Well, the chip that I'm *communicating* with is the Maxim MAX31785
> > Intelligent Fan Controller. As to whether that's what is *causing* the
> > PAGE command to fail is still up in the air. I would not be surprised
> > if we have other issues in the hardware design.
> > 
> > I haven't sent a follow-up to the series introducing the MAX31785
> > driver because I've been chasing down communication issues. I felt it
> > was important to understand whether the device has quirks that need to
> > be worked around in the driver, or if our hardware design has bigger
> > problems that should be handled in other ways. I've been in touch with
> > Maxim who have asserted that some of the problems we're seeing cannot
> > be caused by their chip.
> > 

> Guess I need to dig up my eval board and see if I can reproduce the problem.
> Seems you are saying that the problem is always seen when issuing a sequence
> of "clear faults" commands on multiple pages ?

Yeah. We're also seeing bad behaviour under other command sequences as well,
which lead to this hack of a work-around patch[1].

I'd be very interested in the results of testing against the eval board. I
don't have access to one and it seems Maxim have discontinued them.

[1] https://patchwork.kernel.org/patch/9876083/

> The MAX31785 is microcode based, so I would not be entirely surprised if
> it sometimes fails to reply if a sequence of <set page, clear faults>
> commands is executed.

Have you seen this behaviour in other microcode-based chips?

> If possible, you might try reducing the i2c clock. I have not seen that with
> Maxim chips, but some other PMBus chips don't operate reliably at 400 KHz.

It's already running at 100kHz, which is the maximum clock rate the device is
specified for. I've even underclocked the bus to 75kHz and still observed
issues. Again, I wouldn't be surprised if the problem is something other than
the Maxim chip, the bus that we have it on has a complex topology. I'm working
with hardware people internally to try to isolate the problem.

Cheers,

Andrew

> Guenter

> > > 
> > > > > > Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
> > > > 
> > > > ---
> > > >  Documentation/hwmon/pmbus-core   |  12 ++--
> > > >  drivers/hwmon/pmbus/pmbus.h      |   6 +-
> > > >  drivers/hwmon/pmbus/pmbus_core.c | 115 +++++++++++++++++++++++++++++----------
> > > >  3 files changed, 95 insertions(+), 38 deletions(-)
> > > > 
> > > > diff --git a/Documentation/hwmon/pmbus-core b/Documentation/hwmon/pmbus-core
> > > > index 8ed10e9ddfb5..3e9f41bb756f 100644
> > > > --- a/Documentation/hwmon/pmbus-core
> > > > +++ b/Documentation/hwmon/pmbus-core
> > > > @@ -218,17 +218,17 @@ Specifically, it provides the following information.
> > > >    This function calls the device specific write_byte function if defined.
> > > >    Therefore, it must _not_ be called from that function.
> > > >  
> > > > -  bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > > > +  int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > > >  
> > > > -  Check if byte register exists. Return true if the register exists, false
> > > > -  otherwise.
> > > > +  Check if byte register exists. Returns 1 if the register exists, 0 if it does
> > > > +  not, and less than zero on an unexpected error.
> > > >    This function calls the device specific write_byte function if defined to
> > > >    obtain the chip status. Therefore, it must _not_ be called from that function.
> > > >  
> > > > -  bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > > > +  int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > > >  
> > > > -  Check if word register exists. Return true if the register exists, false
> > > > -  otherwise.
> > > > +  Check if word register exists. Returns 1 if the register exists, 0 if it does
> > > > +  not, and less than zero on an unexpected error.
> > > >    This function calls the device specific write_byte function if defined to
> > > >    obtain the chip status. Therefore, it must _not_ be called from that function.
> > > >  
> > > > diff --git a/drivers/hwmon/pmbus/pmbus.h b/drivers/hwmon/pmbus/pmbus.h
> > > > index bfcb13bae34b..c53032a04a6f 100644
> > > > --- a/drivers/hwmon/pmbus/pmbus.h
> > > > +++ b/drivers/hwmon/pmbus/pmbus.h
> > > > @@ -413,9 +413,9 @@ int pmbus_write_byte_data(struct i2c_client *client, int page, u8 reg,
> > > > > >  			  u8 value);
> > > > 
> > > >  int pmbus_update_byte_data(struct i2c_client *client, int page, u8 reg,
> > > > > >  			   u8 mask, u8 value);
> > > > 
> > > > -void pmbus_clear_faults(struct i2c_client *client);
> > > > -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > > > -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > > > +int pmbus_clear_faults(struct i2c_client *client);
> > > > +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
> > > > +int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
> > > >  int pmbus_do_probe(struct i2c_client *client, const struct i2c_device_id *id,
> > > > > >  		   struct pmbus_driver_info *info);
> > > > 
> > > >  int pmbus_do_remove(struct i2c_client *client);
> > > > diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
> > > > index f1eff6b6c798..153700e35431 100644
> > > > --- a/drivers/hwmon/pmbus/pmbus_core.c
> > > > +++ b/drivers/hwmon/pmbus/pmbus_core.c
> > > > @@ -304,18 +304,24 @@ static int _pmbus_read_byte_data(struct i2c_client *client, int page, int reg)
> > > > > >  	return pmbus_read_byte_data(client, page, reg);
> > > > 
> > > >  }
> > > >  
> > > > -static void pmbus_clear_fault_page(struct i2c_client *client, int page)
> > > > +static int pmbus_clear_fault_page(struct i2c_client *client, int page)
> > > >  {
> > > > > > > > > > > > -	_pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
> > > > > > +	return _pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
> > > > 
> > > >  }
> > > >  
> > > > -void pmbus_clear_faults(struct i2c_client *client)
> > > > +int pmbus_clear_faults(struct i2c_client *client)
> > > >  {
> > > > > > > > > > > >  	struct pmbus_data *data = i2c_get_clientdata(client);
> > > > > > > > > > > > +	int rv;
> > > > > >  	int i;
> > > > 
> > > >  
> > > > > > > > > > > > -	for (i = 0; i < data->info->pages; i++)
> > > > > > > > > > > > -		pmbus_clear_fault_page(client, i);
> > > > > > > > > > > > +	for (i = 0; i < data->info->pages; i++) {
> > > > > > > > > > > > +		rv = pmbus_clear_fault_page(client, i);
> > > > > > > > > > > > +		if (rv)
> > > > > > > > > > > > +			return rv;
> > > > > > +	}
> > > > 
> > > > +
> > > > > > +	return 0;
> > > > 
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(pmbus_clear_faults);
> > > >  
> > > > @@ -333,28 +339,45 @@ static int pmbus_check_status_cml(struct i2c_client *client)
> > > > > >  	return 0;
> > > > 
> > > >  }
> > > >  
> > > > -static bool pmbus_check_register(struct i2c_client *client,
> > > > +static int pmbus_check_register(struct i2c_client *client,
> > > > > > > > > > > >  				 int (*func)(struct i2c_client *client,
> > > > > > > > > > > >  					     int page, int reg),
> > > > > >  				 int page, int reg)
> > > > 
> > > >  {
> > > > > > > > > > > > +	struct pmbus_data *data;
> > > > > > > > > > > > +	int check;
> > > > > > > > > > > >  	int rv;
> > > > > > -	struct pmbus_data *data = i2c_get_clientdata(client);
> > > > 
> > > >  
> > > > > > > > > > > > -	rv = func(client, page, reg);
> > > > > > > > > > > > -	if (rv >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
> > > > > > > > > > > > -		rv = pmbus_check_status_cml(client);
> > > > > > > > > > > > -	pmbus_clear_fault_page(client, -1);
> > > > > > > > > > > > -	return rv >= 0;
> > > > > > +	data = i2c_get_clientdata(client);
> > > > 
> > > > +
> > > > > > > > > > > > +	/*
> > > > > > > > > > > > +	 * pmbus_set_page() guards transactions on the requested page matching
> > > > > > > > > > > > +	 * the current page. This may be done in the execution of func(), but
> > > > > > > > > > > > +	 * at that point a set-page error is conflated with accessing a
> > > > > > > > > > > > +	 * non-existent register.
> > > > > > > > > > > > +	 */
> > > > > > > > > > > > +	rv = pmbus_set_page(client, page);
> > > > > > > > > > > > +	if (rv < 0)
> > > > > > +		return rv;
> > > > 
> > > > +
> > > > > > > > > > > > +	check = func(client, page, reg);
> > > > > > > > > > > > +	if (check >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
> > > > > > +		check = pmbus_check_status_cml(client);
> > > > 
> > > > +
> > > > > > > > > > > > +	rv = pmbus_clear_fault_page(client, -1);
> > > > > > > > > > > > +	if (rv < 0)
> > > > > > +		return rv;
> > > > 
> > > > +
> > > > > > +	return check >= 0;
> > > > 
> > > >  }
> > > >  
> > > > -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
> > > > +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
> > > >  {
> > > > > >  	return pmbus_check_register(client, _pmbus_read_byte_data, page, reg);
> > > > 
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(pmbus_check_byte_register);
> > > >  
> > > > -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg)
> > > > +int pmbus_check_word_register(struct i2c_client *client, int page, int reg)
> > > >  {
> > > > > >  	return pmbus_check_register(client, _pmbus_read_word_data, page, reg);
> > > > 
> > > >  }
> > > > @@ -390,7 +413,7 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
> > > >  
> > > > > > > > > > > >  	mutex_lock(&data->update_lock);
> > > > > > > > > > > >  	if (time_after(jiffies, data->last_updated + HZ) || !data->valid) {
> > > > > > > > > > > > -		int i, j;
> > > > > > +		int i, j, ret;
> > > > 
> > > >  
> > > > > > > > > > > >  		for (i = 0; i < info->pages; i++) {
> > > > > >  			data->status[PB_STATUS_BASE + i]
> > > > 
> > > > @@ -424,7 +447,13 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
> > > > > > > > > > > >  							    sensor->page,
> > > > > > > > > > > >  							    sensor->reg);
> > > > > > > > > > > >  		}
> > > > > > -		pmbus_clear_faults(client);
> > > > 
> > > > +
> > > > > > > > > > > > +		ret = pmbus_clear_faults(client);
> > > > > > > > > > > > +		if (ret < 0) {
> > > > > > > > > > > > +			mutex_unlock(&data->update_lock);
> > > > > > > > > > > > +			return ERR_PTR(ret);
> > > > > > +		}
> > > > 
> > > > +
> > > > > > > > > > > >  		data->last_updated = jiffies;
> > > > > > > > > > > >  		data->valid = 1;
> > > > > >  	}
> > > > 
> > > > @@ -754,6 +783,9 @@ static ssize_t pmbus_show_boolean(struct device *dev,
> > > > > > > > > > > >  	struct pmbus_data *data = pmbus_update_device(dev);
> > > > > >  	int val;
> > > > 
> > > >  
> > > > > > > > > > > > +	if (IS_ERR(data))
> > > > > > +		return PTR_ERR(data);
> > > > 
> > > > +
> > > > > > > > > > > >  	val = pmbus_get_boolean(data, boolean, attr->index);
> > > > > > > > > > > >  	if (val < 0)
> > > > > >  		return val;
> > > > 
> > > > @@ -766,6 +798,9 @@ static ssize_t pmbus_show_sensor(struct device *dev,
> > > > > > > > > > > >  	struct pmbus_data *data = pmbus_update_device(dev);
> > > > > >  	struct pmbus_sensor *sensor = to_pmbus_sensor(devattr);
> > > > 
> > > >  
> > > > > > > > > > > > +	if (IS_ERR(data))
> > > > > > +		return PTR_ERR(data);
> > > > 
> > > > +
> > > > > > > > > > > >  	if (sensor->data < 0)
> > > > > >  		return sensor->data;
> > > > 
> > > >  
> > > > @@ -995,7 +1030,11 @@ static int pmbus_add_limit_attrs(struct i2c_client *client,
> > > > > >  	struct pmbus_sensor *curr;
> > > > 
> > > >  
> > > > > > > > > > > >  	for (i = 0; i < nlimit; i++) {
> > > > > > > > > > > > -		if (pmbus_check_word_register(client, page, l->reg)) {
> > > > > > > > > > > > +		ret = pmbus_check_word_register(client, page, l->reg);
> > > > > > > > > > > > +		if (ret < 0)
> > > > > > +			return ret;
> > > > 
> > > > +
> > > > > > > > > > > > +		if (ret) {
> > > > > > > > > > > >  			curr = pmbus_add_sensor(data, name, l->attr, index,
> > > > > > > > > > > >  						page, l->reg, attr->class,
> > > > > >  						attr->update || l->update,
> > > > 
> > > > @@ -1041,6 +1080,8 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
> > > > > > > > > > > >  	if (!base)
> > > > > > > > > > > >  		return -ENOMEM;
> > > > > > > > > > > >  	if (attr->sfunc) {
> > > > > > +		int check;
> > > > 
> > > > +
> > > > > > > > > > > >  		ret = pmbus_add_limit_attrs(client, data, info, name,
> > > > > > > > > > > >  					    index, page, base, attr);
> > > > > >  		if (ret < 0)
> > > > 
> > > > @@ -1050,9 +1091,13 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
> > > > > > > > > > > >  		 * alarm attributes, if there is a global alarm bit, and if
> > > > > > > > > > > >  		 * the generic status register for this page is accessible.
> > > > > > > > > > > >  		 */
> > > > > > > > > > > > -		if (!ret && attr->gbit &&
> > > > > > > > > > > > -		    pmbus_check_byte_register(client, page,
> > > > > > -					      data->status_register)) {
> > > > 
> > > > +
> > > > > > > > > > > > +		check = pmbus_check_byte_register(client, page,
> > > > > > > > > > > > +						  data->status_register);
> > > > > > > > > > > > +		if (check < 0)
> > > > > > +			return check;
> > > > 
> > > > +
> > > > > > > > > > > > +		if (!ret && attr->gbit && check) {
> > > > > > > > > > > >  			ret = pmbus_add_boolean(data, name, "alarm", index,
> > > > > > > > > > > >  						NULL, NULL,
> > > > > >  						PB_STATUS_BASE + page,
> > > > 
> > > > @@ -1604,8 +1649,12 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
> > > > > > > > > > > >  			if (!(info->func[page] & pmbus_fan_flags[f]))
> > > > > >  				break;
> > > > 
> > > >  
> > > > > > > > > > > > -			if (!pmbus_check_word_register(client, page,
> > > > > > > > > > > > -						       pmbus_fan_registers[f]))
> > > > > > > > > > > > +			ret = pmbus_check_word_register(client, page,
> > > > > > > > > > > > +						       pmbus_fan_registers[f]);
> > > > > > > > > > > > +			if (ret < 0)
> > > > > > +				return ret;
> > > > 
> > > > +
> > > > > > > > > > > > +			if (!ret)
> > > > > >  				break;
> > > > 
> > > >  
> > > > > >  			/*
> > > > 
> > > > @@ -1628,9 +1677,13 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
> > > > > > > > > > > >  			 * Each fan status register covers multiple fans,
> > > > > > > > > > > >  			 * so we have to do some magic.
> > > > > > > > > > > >  			 */
> > > > > > > > > > > > -			if ((info->func[page] & pmbus_fan_status_flags[f]) &&
> > > > > > > > > > > > -			    pmbus_check_byte_register(client,
> > > > > > > > > > > > -					page, pmbus_fan_status_registers[f])) {
> > > > > > > > > > > > +			ret =  pmbus_check_byte_register(client, page,
> > > > > > > > > > > > +						pmbus_fan_status_registers[f]);
> > > > > > > > > > > > +			if (ret < 0)
> > > > > > +				return ret;
> > > > 
> > > > +
> > > > > > > > > > > > +			if ((info->func[page] & pmbus_fan_status_flags[f])
> > > > > > > > > > > > +					&& ret) {
> > > > > >  				int base;
> > > > 
> > > >  
> > > > > > > >  				if (f > 1)	/* fan 3, 4 */
> > > > 
> > > > @@ -1696,10 +1749,13 @@ static int pmbus_identify_common(struct i2c_client *client,
> > > > > >  				 struct pmbus_data *data, int page)
> > > > 
> > > >  {
> > > > > > > > > > > >  	int vout_mode = -1;
> > > > > > +	int rv;
> > > > 
> > > >  
> > > > > > > > > > > > -	if (pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE))
> > > > > > > > > > > > +	rv = pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE);
> > > > > > > > > > > > +	if (rv == 1)
> > > > > > > > > > > >  		vout_mode = _pmbus_read_byte_data(client, page,
> > > > > >  						  PMBUS_VOUT_MODE);
> > > > 
> > > > +
> > > > > > > > > > > >  	if (vout_mode >= 0 && vout_mode != 0xff) {
> > > > > > > > > > > >  		/*
> > > > > >  		 * Not all chips support the VOUT_MODE command,
> > > > 
> > > > @@ -1725,8 +1781,7 @@ static int pmbus_identify_common(struct i2c_client *client,
> > > > > > > > > > > >  		}
> > > > > >  	}
> > > > 
> > > >  
> > > > > > > > > > > > -	pmbus_clear_fault_page(client, page);
> > > > > > > > > > > > -	return 0;
> > > > > > +	return pmbus_clear_fault_page(client, page);
> > > > 
> > > >  }
> > > >  
> > > >  static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
> > > > @@ -1756,7 +1811,9 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
> > > > > > > > > > > >  	if (ret >= 0 && (ret & PB_CAPABILITY_ERROR_CHECK))
> > > > > >  		client->flags |= I2C_CLIENT_PEC;
> > > > 
> > > >  
> > > > > > > > > > > > -	pmbus_clear_faults(client);
> > > > > > > > > > > > +	ret = pmbus_clear_faults(client);
> > > > > > > > > > > > +	if (ret < 0)
> > > > > > +		return ret;
> > > > 
> > > >  
> > > > > > > > > > > >  	if (info->identify) {
> > > > > >  		ret = (*info->identify)(client, info);
> > > > 
> > > > -- 
> > > > 2.11.0
> > > > 

>
Guenter Roeck Sept. 7, 2017, 2:19 a.m. UTC | #5
On 09/06/2017 04:32 PM, Andrew Jeffery wrote:
> On Wed, 2017-09-06 at 15:51 -0700, Guenter Roeck wrote:
>> On Wed, Sep 06, 2017 at 10:23:37AM +1000, Andrew Jeffery wrote:
>>> On Tue, 2017-09-05 at 10:00 -0700, Guenter Roeck wrote:
>>>> On Tue, Sep 05, 2017 at 05:01:32PM +1000, Andrew Jeffery wrote:
>>>>> Some functions exposed by pmbus core conflated errors that occurred when
>>>>> setting the page to access with errors that occurred when accessing
>>>>> registers in a page. In some cases, this caused legitimate errors to be
>>>>> hidden under the guise of the register not being supported.
>>>>>   
>>>>   
>>>> I'll have to look into this. If I recall correctly, the reason for not returning
>>>> errors from the "clear faults" command was that some early chips did not support
>>>> it, or did not support it on all pages. Those chips would now always fail.
>>>   
>>> Yes, that is a concern.
>>>   
>>> However, shouldn't the lack of support for CLEAR_FAULTS be a described
>>> property of the chip or page?
>>>   
>>> Regardless, the issue here is the PAGE command is sometimes failing
>>> (more below). This means we won't have issued a CLEAR_FAULTS against
>>> the page even if the page supports CLEAR_FAULTS. That to me is
>>> something that should be propagated back up the call chain, as faults
>>> that can be cleared will not have been.
>>>   
>>   
>> We could also consider
>> - retry
> 
> I guess that leads to the usual retries problem: How many? Oneshot? More?
> 
> My expectation is that oneshot would be enough, but we'd still need to consider
> what to do if that wasn't successful.
> 
> Does the retry policy need to be in the kernel? I guess if it's not we would
> need all operations in the path to the failure to be idempotent.
> 
>> - Add a marker indicating that faults (specifically command errors)
>>    are unreliable after clearing faults failed
> 
> What kind of marker were you thinking? Something in dmesg? If it's anything
> else we'd probably want something to respond to the condition, which nothing
> would do currently.
> 
>>   
>> If we can't reliably clear faults, all bets are off.
> 
> Yeah, it's a messy problem. However even if we resolve our issues in hardware
> (i.e. it's discovered that it is a design failure or something), I don't think
> we should drop a resolution to the problem highlighted by this patch.
> 
>>   
>>>>   
>>>> On a higher level, I am not sure if it is a good idea to return an error
>>>> from a function intended to clear faults (and nothing else). That seems
>>>> counterproductive.
>>>>   
>>>> Is this a problem you have actually observed ?
>>>   
>>> Unfortunately yes. I was trying to reproduce some issues that we are
>>> seeing in userspace but wasn't having much luck (getting -EIO on
>>> reading a hwmon attribute). I ended up instrumenting our I2C bus
>>> driver, and found that the problem was more prevalent than what the
>>> read failures in userspace were indicating. The paths that were
>>> triggering these unreported conditions all traced through the check
>>> register and clear fault functions, which on analysis were swallowing
>>> the error.
>>>   
>>>> If so, what is the chip ?
>>>   
>>> Well, the chip that I'm *communicating* with is the Maxim MAX31785
>>> Intelligent Fan Controller. As to whether that's what is *causing* the
>>> PAGE command to fail is still up in the air. I would not be surprised
>>> if we have other issues in the hardware design.
>>>   
>>> I haven't sent a follow-up to the series introducing the MAX31785
>>> driver because I've been chasing down communication issues. I felt it
>>> was important to understand whether the device has quirks that need to
>>> be worked around in the driver, or if our hardware design has bigger
>>> problems that should be handled in other ways. I've been in touch with
>>> Maxim who have asserted that some of the problems we're seeing cannot
>>> be caused by their chip.
>>>   
>>   
>> Guess I need to dig up my eval board and see if I can reproduce the problem.
>> Seems you are saying that the problem is always seen when issuing a sequence
>> of "clear faults" commands on multiple pages ?
> 
> Yeah. We're also seeing bad behaviour under other command sequences as well,
> which lead to this hack of a work-around patch[1].
> 
> I'd be very interested in the results of testing against the eval board. I
> don't have access to one and it seems Maxim have discontinued them.
> 
> [1] https://patchwork.kernel.org/patch/9876083/
> 
>>   
>> The MAX31785 is microcode based, so I would not be entirely surprised if
>> it sometimes fails to reply if a sequence of <set page, clear faults>
>> commands is executed.
> 
> Have you seen this behaviour in other microcode-based chips?
> 

ltc2978 (commit e04d1ce9bbb) and most of the Zilker Labs chips (zl6100.c).

You could try the approach I used in the zl6100 driver: Add a minimum time
between accesses to the chip. I would probably no longer use udelay(), though,
but usleeep_range(), if I were to implement the code today.

>>   
>> If possible, you might try reducing the i2c clock. I have not seen that with
>> Maxim chips, but some other PMBus chips don't operate reliably at 400 KHz.
> 
> It's already running at 100kHz, which is the maximum clock rate the device is
> specified for. I've even underclocked the bus to 75kHz and still observed
> issues. Again, I wouldn't be surprised if the problem is something other than
> the Maxim chip, the bus that we have it on has a complex topology. I'm working
> with hardware people internally to try to isolate the problem.
> 

If you can, you might try something in between. Early TI chips (ucd9000 variants,
some sold as OEM chips to third parties) had a problem with both 100kHz and 400kHz,
but worked fine between about 150 kHz and 350 kHz, for whatever reason. Of course,
maybe that was a signal problem on our boards at the time (the i2c signal routing
was quite complex).

Guenter

> Cheers,
> 
> Andrew
> 
>>   
>> Guenter
>>   
>>>>   
>>>>>>> Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
>>>>>   
>>>>> ---
>>>>>   Documentation/hwmon/pmbus-core   |  12 ++--
>>>>>   drivers/hwmon/pmbus/pmbus.h      |   6 +-
>>>>>   drivers/hwmon/pmbus/pmbus_core.c | 115 +++++++++++++++++++++++++++++----------
>>>>>   3 files changed, 95 insertions(+), 38 deletions(-)
>>>>>   
>>>>> diff --git a/Documentation/hwmon/pmbus-core b/Documentation/hwmon/pmbus-core
>>>>> index 8ed10e9ddfb5..3e9f41bb756f 100644
>>>>> --- a/Documentation/hwmon/pmbus-core
>>>>> +++ b/Documentation/hwmon/pmbus-core
>>>>> @@ -218,17 +218,17 @@ Specifically, it provides the following information.
>>>>>     This function calls the device specific write_byte function if defined.
>>>>>     Therefore, it must _not_ be called from that function.
>>>>>   
>>>>> -  bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
>>>>> +  int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
>>>>>   
>>>>> -  Check if byte register exists. Return true if the register exists, false
>>>>> -  otherwise.
>>>>> +  Check if byte register exists. Returns 1 if the register exists, 0 if it does
>>>>> +  not, and less than zero on an unexpected error.
>>>>>     This function calls the device specific write_byte function if defined to
>>>>>     obtain the chip status. Therefore, it must _not_ be called from that function.
>>>>>   
>>>>> -  bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
>>>>> +  int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
>>>>>   
>>>>> -  Check if word register exists. Return true if the register exists, false
>>>>> -  otherwise.
>>>>> +  Check if word register exists. Returns 1 if the register exists, 0 if it does
>>>>> +  not, and less than zero on an unexpected error.
>>>>>     This function calls the device specific write_byte function if defined to
>>>>>     obtain the chip status. Therefore, it must _not_ be called from that function.
>>>>>   
>>>>> diff --git a/drivers/hwmon/pmbus/pmbus.h b/drivers/hwmon/pmbus/pmbus.h
>>>>> index bfcb13bae34b..c53032a04a6f 100644
>>>>> --- a/drivers/hwmon/pmbus/pmbus.h
>>>>> +++ b/drivers/hwmon/pmbus/pmbus.h
>>>>> @@ -413,9 +413,9 @@ int pmbus_write_byte_data(struct i2c_client *client, int page, u8 reg,
>>>>>>>   			  u8 value);
>>>>>   
>>>>>   int pmbus_update_byte_data(struct i2c_client *client, int page, u8 reg,
>>>>>>>   			   u8 mask, u8 value);
>>>>>   
>>>>> -void pmbus_clear_faults(struct i2c_client *client);
>>>>> -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
>>>>> -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
>>>>> +int pmbus_clear_faults(struct i2c_client *client);
>>>>> +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
>>>>> +int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
>>>>>   int pmbus_do_probe(struct i2c_client *client, const struct i2c_device_id *id,
>>>>>>>   		   struct pmbus_driver_info *info);
>>>>>   
>>>>>   int pmbus_do_remove(struct i2c_client *client);
>>>>> diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
>>>>> index f1eff6b6c798..153700e35431 100644
>>>>> --- a/drivers/hwmon/pmbus/pmbus_core.c
>>>>> +++ b/drivers/hwmon/pmbus/pmbus_core.c
>>>>> @@ -304,18 +304,24 @@ static int _pmbus_read_byte_data(struct i2c_client *client, int page, int reg)
>>>>>>>   	return pmbus_read_byte_data(client, page, reg);
>>>>>   
>>>>>   }
>>>>>   
>>>>> -static void pmbus_clear_fault_page(struct i2c_client *client, int page)
>>>>> +static int pmbus_clear_fault_page(struct i2c_client *client, int page)
>>>>>   {
>>>>>>>>>>>>> -	_pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
>>>>>>> +	return _pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
>>>>>   
>>>>>   }
>>>>>   
>>>>> -void pmbus_clear_faults(struct i2c_client *client)
>>>>> +int pmbus_clear_faults(struct i2c_client *client)
>>>>>   {
>>>>>>>>>>>>>   	struct pmbus_data *data = i2c_get_clientdata(client);
>>>>>>>>>>>>> +	int rv;
>>>>>>>   	int i;
>>>>>   
>>>>>   
>>>>>>>>>>>>> -	for (i = 0; i < data->info->pages; i++)
>>>>>>>>>>>>> -		pmbus_clear_fault_page(client, i);
>>>>>>>>>>>>> +	for (i = 0; i < data->info->pages; i++) {
>>>>>>>>>>>>> +		rv = pmbus_clear_fault_page(client, i);
>>>>>>>>>>>>> +		if (rv)
>>>>>>>>>>>>> +			return rv;
>>>>>>> +	}
>>>>>   
>>>>> +
>>>>>>> +	return 0;
>>>>>   
>>>>>   }
>>>>>   EXPORT_SYMBOL_GPL(pmbus_clear_faults);
>>>>>   
>>>>> @@ -333,28 +339,45 @@ static int pmbus_check_status_cml(struct i2c_client *client)
>>>>>>>   	return 0;
>>>>>   
>>>>>   }
>>>>>   
>>>>> -static bool pmbus_check_register(struct i2c_client *client,
>>>>> +static int pmbus_check_register(struct i2c_client *client,
>>>>>>>>>>>>>   				 int (*func)(struct i2c_client *client,
>>>>>>>>>>>>>   					     int page, int reg),
>>>>>>>   				 int page, int reg)
>>>>>   
>>>>>   {
>>>>>>>>>>>>> +	struct pmbus_data *data;
>>>>>>>>>>>>> +	int check;
>>>>>>>>>>>>>   	int rv;
>>>>>>> -	struct pmbus_data *data = i2c_get_clientdata(client);
>>>>>   
>>>>>   
>>>>>>>>>>>>> -	rv = func(client, page, reg);
>>>>>>>>>>>>> -	if (rv >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
>>>>>>>>>>>>> -		rv = pmbus_check_status_cml(client);
>>>>>>>>>>>>> -	pmbus_clear_fault_page(client, -1);
>>>>>>>>>>>>> -	return rv >= 0;
>>>>>>> +	data = i2c_get_clientdata(client);
>>>>>   
>>>>> +
>>>>>>>>>>>>> +	/*
>>>>>>>>>>>>> +	 * pmbus_set_page() guards transactions on the requested page matching
>>>>>>>>>>>>> +	 * the current page. This may be done in the execution of func(), but
>>>>>>>>>>>>> +	 * at that point a set-page error is conflated with accessing a
>>>>>>>>>>>>> +	 * non-existent register.
>>>>>>>>>>>>> +	 */
>>>>>>>>>>>>> +	rv = pmbus_set_page(client, page);
>>>>>>>>>>>>> +	if (rv < 0)
>>>>>>> +		return rv;
>>>>>   
>>>>> +
>>>>>>>>>>>>> +	check = func(client, page, reg);
>>>>>>>>>>>>> +	if (check >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
>>>>>>> +		check = pmbus_check_status_cml(client);
>>>>>   
>>>>> +
>>>>>>>>>>>>> +	rv = pmbus_clear_fault_page(client, -1);
>>>>>>>>>>>>> +	if (rv < 0)
>>>>>>> +		return rv;
>>>>>   
>>>>> +
>>>>>>> +	return check >= 0;
>>>>>   
>>>>>   }
>>>>>   
>>>>> -bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
>>>>> +int pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
>>>>>   {
>>>>>>>   	return pmbus_check_register(client, _pmbus_read_byte_data, page, reg);
>>>>>   
>>>>>   }
>>>>>   EXPORT_SYMBOL_GPL(pmbus_check_byte_register);
>>>>>   
>>>>> -bool pmbus_check_word_register(struct i2c_client *client, int page, int reg)
>>>>> +int pmbus_check_word_register(struct i2c_client *client, int page, int reg)
>>>>>   {
>>>>>>>   	return pmbus_check_register(client, _pmbus_read_word_data, page, reg);
>>>>>   
>>>>>   }
>>>>> @@ -390,7 +413,7 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
>>>>>   
>>>>>>>>>>>>>   	mutex_lock(&data->update_lock);
>>>>>>>>>>>>>   	if (time_after(jiffies, data->last_updated + HZ) || !data->valid) {
>>>>>>>>>>>>> -		int i, j;
>>>>>>> +		int i, j, ret;
>>>>>   
>>>>>   
>>>>>>>>>>>>>   		for (i = 0; i < info->pages; i++) {
>>>>>>>   			data->status[PB_STATUS_BASE + i]
>>>>>   
>>>>> @@ -424,7 +447,13 @@ static struct pmbus_data *pmbus_update_device(struct device *dev)
>>>>>>>>>>>>>   							    sensor->page,
>>>>>>>>>>>>>   							    sensor->reg);
>>>>>>>>>>>>>   		}
>>>>>>> -		pmbus_clear_faults(client);
>>>>>   
>>>>> +
>>>>>>>>>>>>> +		ret = pmbus_clear_faults(client);
>>>>>>>>>>>>> +		if (ret < 0) {
>>>>>>>>>>>>> +			mutex_unlock(&data->update_lock);
>>>>>>>>>>>>> +			return ERR_PTR(ret);
>>>>>>> +		}
>>>>>   
>>>>> +
>>>>>>>>>>>>>   		data->last_updated = jiffies;
>>>>>>>>>>>>>   		data->valid = 1;
>>>>>>>   	}
>>>>>   
>>>>> @@ -754,6 +783,9 @@ static ssize_t pmbus_show_boolean(struct device *dev,
>>>>>>>>>>>>>   	struct pmbus_data *data = pmbus_update_device(dev);
>>>>>>>   	int val;
>>>>>   
>>>>>   
>>>>>>>>>>>>> +	if (IS_ERR(data))
>>>>>>> +		return PTR_ERR(data);
>>>>>   
>>>>> +
>>>>>>>>>>>>>   	val = pmbus_get_boolean(data, boolean, attr->index);
>>>>>>>>>>>>>   	if (val < 0)
>>>>>>>   		return val;
>>>>>   
>>>>> @@ -766,6 +798,9 @@ static ssize_t pmbus_show_sensor(struct device *dev,
>>>>>>>>>>>>>   	struct pmbus_data *data = pmbus_update_device(dev);
>>>>>>>   	struct pmbus_sensor *sensor = to_pmbus_sensor(devattr);
>>>>>   
>>>>>   
>>>>>>>>>>>>> +	if (IS_ERR(data))
>>>>>>> +		return PTR_ERR(data);
>>>>>   
>>>>> +
>>>>>>>>>>>>>   	if (sensor->data < 0)
>>>>>>>   		return sensor->data;
>>>>>   
>>>>>   
>>>>> @@ -995,7 +1030,11 @@ static int pmbus_add_limit_attrs(struct i2c_client *client,
>>>>>>>   	struct pmbus_sensor *curr;
>>>>>   
>>>>>   
>>>>>>>>>>>>>   	for (i = 0; i < nlimit; i++) {
>>>>>>>>>>>>> -		if (pmbus_check_word_register(client, page, l->reg)) {
>>>>>>>>>>>>> +		ret = pmbus_check_word_register(client, page, l->reg);
>>>>>>>>>>>>> +		if (ret < 0)
>>>>>>> +			return ret;
>>>>>   
>>>>> +
>>>>>>>>>>>>> +		if (ret) {
>>>>>>>>>>>>>   			curr = pmbus_add_sensor(data, name, l->attr, index,
>>>>>>>>>>>>>   						page, l->reg, attr->class,
>>>>>>>   						attr->update || l->update,
>>>>>   
>>>>> @@ -1041,6 +1080,8 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
>>>>>>>>>>>>>   	if (!base)
>>>>>>>>>>>>>   		return -ENOMEM;
>>>>>>>>>>>>>   	if (attr->sfunc) {
>>>>>>> +		int check;
>>>>>   
>>>>> +
>>>>>>>>>>>>>   		ret = pmbus_add_limit_attrs(client, data, info, name,
>>>>>>>>>>>>>   					    index, page, base, attr);
>>>>>>>   		if (ret < 0)
>>>>>   
>>>>> @@ -1050,9 +1091,13 @@ static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
>>>>>>>>>>>>>   		 * alarm attributes, if there is a global alarm bit, and if
>>>>>>>>>>>>>   		 * the generic status register for this page is accessible.
>>>>>>>>>>>>>   		 */
>>>>>>>>>>>>> -		if (!ret && attr->gbit &&
>>>>>>>>>>>>> -		    pmbus_check_byte_register(client, page,
>>>>>>> -					      data->status_register)) {
>>>>>   
>>>>> +
>>>>>>>>>>>>> +		check = pmbus_check_byte_register(client, page,
>>>>>>>>>>>>> +						  data->status_register);
>>>>>>>>>>>>> +		if (check < 0)
>>>>>>> +			return check;
>>>>>   
>>>>> +
>>>>>>>>>>>>> +		if (!ret && attr->gbit && check) {
>>>>>>>>>>>>>   			ret = pmbus_add_boolean(data, name, "alarm", index,
>>>>>>>>>>>>>   						NULL, NULL,
>>>>>>>   						PB_STATUS_BASE + page,
>>>>>   
>>>>> @@ -1604,8 +1649,12 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
>>>>>>>>>>>>>   			if (!(info->func[page] & pmbus_fan_flags[f]))
>>>>>>>   				break;
>>>>>   
>>>>>   
>>>>>>>>>>>>> -			if (!pmbus_check_word_register(client, page,
>>>>>>>>>>>>> -						       pmbus_fan_registers[f]))
>>>>>>>>>>>>> +			ret = pmbus_check_word_register(client, page,
>>>>>>>>>>>>> +						       pmbus_fan_registers[f]);
>>>>>>>>>>>>> +			if (ret < 0)
>>>>>>> +				return ret;
>>>>>   
>>>>> +
>>>>>>>>>>>>> +			if (!ret)
>>>>>>>   				break;
>>>>>   
>>>>>   
>>>>>>>   			/*
>>>>>   
>>>>> @@ -1628,9 +1677,13 @@ static int pmbus_add_fan_attributes(struct i2c_client *client,
>>>>>>>>>>>>>   			 * Each fan status register covers multiple fans,
>>>>>>>>>>>>>   			 * so we have to do some magic.
>>>>>>>>>>>>>   			 */
>>>>>>>>>>>>> -			if ((info->func[page] & pmbus_fan_status_flags[f]) &&
>>>>>>>>>>>>> -			    pmbus_check_byte_register(client,
>>>>>>>>>>>>> -					page, pmbus_fan_status_registers[f])) {
>>>>>>>>>>>>> +			ret =  pmbus_check_byte_register(client, page,
>>>>>>>>>>>>> +						pmbus_fan_status_registers[f]);
>>>>>>>>>>>>> +			if (ret < 0)
>>>>>>> +				return ret;
>>>>>   
>>>>> +
>>>>>>>>>>>>> +			if ((info->func[page] & pmbus_fan_status_flags[f])
>>>>>>>>>>>>> +					&& ret) {
>>>>>>>   				int base;
>>>>>   
>>>>>   
>>>>>>>>>   				if (f > 1)	/* fan 3, 4 */
>>>>>   
>>>>> @@ -1696,10 +1749,13 @@ static int pmbus_identify_common(struct i2c_client *client,
>>>>>>>   				 struct pmbus_data *data, int page)
>>>>>   
>>>>>   {
>>>>>>>>>>>>>   	int vout_mode = -1;
>>>>>>> +	int rv;
>>>>>   
>>>>>   
>>>>>>>>>>>>> -	if (pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE))
>>>>>>>>>>>>> +	rv = pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE);
>>>>>>>>>>>>> +	if (rv == 1)
>>>>>>>>>>>>>   		vout_mode = _pmbus_read_byte_data(client, page,
>>>>>>>   						  PMBUS_VOUT_MODE);
>>>>>   
>>>>> +
>>>>>>>>>>>>>   	if (vout_mode >= 0 && vout_mode != 0xff) {
>>>>>>>>>>>>>   		/*
>>>>>>>   		 * Not all chips support the VOUT_MODE command,
>>>>>   
>>>>> @@ -1725,8 +1781,7 @@ static int pmbus_identify_common(struct i2c_client *client,
>>>>>>>>>>>>>   		}
>>>>>>>   	}
>>>>>   
>>>>>   
>>>>>>>>>>>>> -	pmbus_clear_fault_page(client, page);
>>>>>>>>>>>>> -	return 0;
>>>>>>> +	return pmbus_clear_fault_page(client, page);
>>>>>   
>>>>>   }
>>>>>   
>>>>>   static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
>>>>> @@ -1756,7 +1811,9 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
>>>>>>>>>>>>>   	if (ret >= 0 && (ret & PB_CAPABILITY_ERROR_CHECK))
>>>>>>>   		client->flags |= I2C_CLIENT_PEC;
>>>>>   
>>>>>   
>>>>>>>>>>>>> -	pmbus_clear_faults(client);
>>>>>>>>>>>>> +	ret = pmbus_clear_faults(client);
>>>>>>>>>>>>> +	if (ret < 0)
>>>>>>> +		return ret;
>>>>>   
>>>>>   
>>>>>>>>>>>>>   	if (info->identify) {
>>>>>>>   		ret = (*info->identify)(client, info);
>>>>>   
>>>>> -- 
>>>>> 2.11.0
>>>>>   
>>   
>>   

--
To unsubscribe from this list: send the line "unsubscribe linux-hwmon" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Guenter Roeck Sept. 7, 2017, 1:40 p.m. UTC | #6
On 09/06/2017 04:32 PM, Andrew Jeffery wrote:

>>   
>> Guess I need to dig up my eval board and see if I can reproduce the problem.
>> Seems you are saying that the problem is always seen when issuing a sequence
>> of "clear faults" commands on multiple pages ?
> 
> Yeah. We're also seeing bad behaviour under other command sequences as well,
> which lead to this hack of a work-around patch[1].
> 
> I'd be very interested in the results of testing against the eval board. I
> don't have access to one and it seems Maxim have discontinued them.
> 

Do you have a somewhat reliable means to reproduce the problem ?

Guenter

--
To unsubscribe from this list: send the line "unsubscribe linux-hwmon" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jeffery Sept. 7, 2017, 3:22 p.m. UTC | #7
On Thu, 2017-09-07 at 06:40 -0700, Guenter Roeck wrote:
> On 09/06/2017 04:32 PM, Andrew Jeffery wrote:
> 
> > >   
> > > Guess I need to dig up my eval board and see if I can reproduce the problem.
> > > Seems you are saying that the problem is always seen when issuing a sequence
> > > of "clear faults" commands on multiple pages ?
> > 
> > Yeah. We're also seeing bad behaviour under other command sequences as well,
> > which lead to this hack of a work-around patch[1].
> > 
> > I'd be very interested in the results of testing against the eval board. I
> > don't have access to one and it seems Maxim have discontinued them.
> > 
> 
> Do you have a somewhat reliable means to reproduce the problem ?

It seems we hit a bunch of problems by just continually
binding/unbinding the driver, if you don't apply that hacky oneshot
retry patch. We can hit problems (in our design?) with something like:

# cd /sys/bus/i2c/drivers/max31785; \
	echo $addr > unbind; \
	while echo $addr > bind; \
	do echo $addr > unbind; echo -n .; done;

It should hit issues covered by this patch, as the register checks are
used in the operations used by probe.

Andrew
Guenter Roeck Sept. 8, 2017, 12:27 a.m. UTC | #8
On 09/07/2017 08:22 AM, Andrew Jeffery wrote:
> On Thu, 2017-09-07 at 06:40 -0700, Guenter Roeck wrote:
>> On 09/06/2017 04:32 PM, Andrew Jeffery wrote:
>>
>>>>    
>>>> Guess I need to dig up my eval board and see if I can reproduce the problem.
>>>> Seems you are saying that the problem is always seen when issuing a sequence
>>>> of "clear faults" commands on multiple pages ?
>>>
>>> Yeah. We're also seeing bad behaviour under other command sequences as well,
>>> which lead to this hack of a work-around patch[1].
>>>
>>> I'd be very interested in the results of testing against the eval board. I
>>> don't have access to one and it seems Maxim have discontinued them.
>>>
>>
>> Do you have a somewhat reliable means to reproduce the problem ?
> 
> It seems we hit a bunch of problems by just continually
> binding/unbinding the driver, if you don't apply that hacky oneshot
> retry patch. We can hit problems (in our design?) with something like:
> 
> # cd /sys/bus/i2c/drivers/max31785; \
> 	echo $addr > unbind; \
> 	while echo $addr > bind; \
> 	do echo $addr > unbind; echo -n .; done;
> 
> It should hit issues covered by this patch, as the register checks are
> used in the operations used by probe.
> 

Hmm ... I didn't use your driver but my prototype driver which also supports
temperature and voltage attributes, so if anything it should create more
stress on the chip. No error so far, after running the script for a couple
of minutes. How long does it take for errors to appear, and how do I see
that there is an error ? Does the driver fail to instantiate ?

Guenter
--
To unsubscribe from this list: send the line "unsubscribe linux-hwmon" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jeffery Sept. 8, 2017, 1:02 a.m. UTC | #9
On Thu, 2017-09-07 at 17:27 -0700, Guenter Roeck wrote:
> On 09/07/2017 08:22 AM, Andrew Jeffery wrote:
> > On Thu, 2017-09-07 at 06:40 -0700, Guenter Roeck wrote:
> > > On 09/06/2017 04:32 PM, Andrew Jeffery wrote:
> > > 
> > > > >    
> > > > > Guess I need to dig up my eval board and see if I can reproduce the problem.
> > > > > Seems you are saying that the problem is always seen when issuing a sequence
> > > > > of "clear faults" commands on multiple pages ?
> > > > 
> > > > Yeah. We're also seeing bad behaviour under other command sequences as well,
> > > > which lead to this hack of a work-around patch[1].
> > > > 
> > > > I'd be very interested in the results of testing against the eval board. I
> > > > don't have access to one and it seems Maxim have discontinued them.
> > > > 
> > > 
> > > Do you have a somewhat reliable means to reproduce the problem ?
> > 
> > It seems we hit a bunch of problems by just continually
> > binding/unbinding the driver, if you don't apply that hacky oneshot
> > retry patch. We can hit problems (in our design?) with something like:
> > 
> > # cd /sys/bus/i2c/drivers/max31785; \
> > 	echo $addr > unbind; \
> > 	while echo $addr > bind; \
> > 	do echo $addr > unbind; echo -n .; done;
> > 
> > It should hit issues covered by this patch, as the register checks are
> > used in the operations used by probe.
> > 
> 
> Hmm ... I didn't use your driver but my prototype driver which also supports
> temperature and voltage attributes, so if anything it should create more
> stress on the chip.

I did add the temp and voltage attributes...

Any chance you can give mine a try? I don't know what I would have done
to invoke this kind of behaviour, so it would be useful to know whether
or not it happens with one driver but not the other.

>  No error so far, after running the script for a couple
> of minutes. How long does it take for errors to appear, and how do I see
> that there is an error ? 

I'm seeing failures after anything from a handful of bind/unbinds, to
hundreds of bind/unbinds. It seems to vary. 

> Does the driver fail to instantiate ?

Typically probe fails so the loop exits. It usually gets -EIO and the
shell spits out "No such device".

Thanks for testing, it's a useful data point for us hunting down the
source of our problems.

Andrew
Guenter Roeck Sept. 8, 2017, 1:26 a.m. UTC | #10
On 09/07/2017 06:02 PM, Andrew Jeffery wrote:
> On Thu, 2017-09-07 at 17:27 -0700, Guenter Roeck wrote:
>> On 09/07/2017 08:22 AM, Andrew Jeffery wrote:
>>> On Thu, 2017-09-07 at 06:40 -0700, Guenter Roeck wrote:
>>>> On 09/06/2017 04:32 PM, Andrew Jeffery wrote:
>>>>
>>>>>>     
>>>>>> Guess I need to dig up my eval board and see if I can reproduce the problem.
>>>>>> Seems you are saying that the problem is always seen when issuing a sequence
>>>>>> of "clear faults" commands on multiple pages ?
>>>>>
>>>>> Yeah. We're also seeing bad behaviour under other command sequences as well,
>>>>> which lead to this hack of a work-around patch[1].
>>>>>
>>>>> I'd be very interested in the results of testing against the eval board. I
>>>>> don't have access to one and it seems Maxim have discontinued them.
>>>>>
>>>>
>>>> Do you have a somewhat reliable means to reproduce the problem ?
>>>
>>> It seems we hit a bunch of problems by just continually
>>> binding/unbinding the driver, if you don't apply that hacky oneshot
>>> retry patch. We can hit problems (in our design?) with something like:
>>>
>>> # cd /sys/bus/i2c/drivers/max31785; \
>>> 	echo $addr > unbind; \
>>> 	while echo $addr > bind; \
>>> 	do echo $addr > unbind; echo -n .; done;
>>>
>>> It should hit issues covered by this patch, as the register checks are
>>> used in the operations used by probe.
>>>
>>
>> Hmm ... I didn't use your driver but my prototype driver which also supports
>> temperature and voltage attributes, so if anything it should create more
>> stress on the chip.
> 
> I did add the temp and voltage attributes...
> 
> Any chance you can give mine a try? I don't know what I would have done
> to invoke this kind of behaviour, so it would be useful to know whether
> or not it happens with one driver but not the other.
> 

Will do.

>>   No error so far, after running the script for a couple
>> of minutes. How long does it take for errors to appear, and how do I see
>> that there is an error ?
> 
> I'm seeing failures after anything from a handful of bind/unbinds, to
> hundreds of bind/unbinds. It seems to vary.
> 
>> Does the driver fail to instantiate ?
> 
> Typically probe fails so the loop exits. It usually gets -EIO and the
> shell spits out "No such device".
> 
> Thanks for testing, it's a useful data point for us hunting down the
> source of our problems.
> 
I aborted the test after ~2,500 loops without error.

Guenter
--
To unsubscribe from this list: send the line "unsubscribe linux-hwmon" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jeffery Sept. 8, 2017, 2:06 a.m. UTC | #11
On Thu, 2017-09-07 at 18:26 -0700, Guenter Roeck wrote:
> On 09/07/2017 06:02 PM, Andrew Jeffery wrote:
> > On Thu, 2017-09-07 at 17:27 -0700, Guenter Roeck wrote:
> > > On 09/07/2017 08:22 AM, Andrew Jeffery wrote:
> > > > On Thu, 2017-09-07 at 06:40 -0700, Guenter Roeck wrote:
> > > > > On 09/06/2017 04:32 PM, Andrew Jeffery wrote:
> > > > > 
> > > > > > >     
> > > > > > > Guess I need to dig up my eval board and see if I can reproduce the problem.
> > > > > > > Seems you are saying that the problem is always seen when issuing a sequence
> > > > > > > of "clear faults" commands on multiple pages ?
> > > > > > 
> > > > > > Yeah. We're also seeing bad behaviour under other command sequences as well,
> > > > > > which lead to this hack of a work-around patch[1].
> > > > > > 
> > > > > > I'd be very interested in the results of testing against the eval board. I
> > > > > > don't have access to one and it seems Maxim have discontinued them.
> > > > > > 
> > > > > 
> > > > > Do you have a somewhat reliable means to reproduce the problem ?
> > > > 
> > > > It seems we hit a bunch of problems by just continually
> > > > binding/unbinding the driver, if you don't apply that hacky oneshot
> > > > retry patch. We can hit problems (in our design?) with something like:
> > > > 
> > > > # cd /sys/bus/i2c/drivers/max31785; \
> > > > 	echo $addr > unbind; \
> > > > 	while echo $addr > bind; \
> > > > 	do echo $addr > unbind; echo -n .; done;
> > > > 
> > > > It should hit issues covered by this patch, as the register checks are
> > > > used in the operations used by probe.
> > > > 
> > > 
> > > Hmm ... I didn't use your driver but my prototype driver which also supports
> > > temperature and voltage attributes, so if anything it should create more
> > > stress on the chip.
> > 
> > I did add the temp and voltage attributes...
> > 
> > Any chance you can give mine a try? I don't know what I would have done
> > to invoke this kind of behaviour, so it would be useful to know whether
> > or not it happens with one driver but not the other.
> > 
> 
> Will do.

Thanks. For reference, here's a devicetree description:

https://github.com/openbmc/linux/blob/dev-4.10/arch/arm/boot/dts/aspeed-bmc-opp-witherspoon.dts#L283


> 
> > >   No error so far, after running the script for a couple
> > > of minutes. How long does it take for errors to appear, and how do I see
> > > that there is an error ?
> > 
> > I'm seeing failures after anything from a handful of bind/unbinds, to
> > hundreds of bind/unbinds. It seems to vary.
> > 
> > > Does the driver fail to instantiate ?
> > 
> > Typically probe fails so the loop exits. It usually gets -EIO and the
> > shell spits out "No such device".
> > 
> > Thanks for testing, it's a useful data point for us hunting down the
> > source of our problems.
> > 
> 
> I aborted the test after ~2,500 loops without error.

Yeah, I'd consider that fairly stable.

Cheers,

Andrew
Guenter Roeck Sept. 8, 2017, 2:17 a.m. UTC | #12
On 09/07/2017 07:06 PM, Andrew Jeffery wrote:
> On Thu, 2017-09-07 at 18:26 -0700, Guenter Roeck wrote:
>> On 09/07/2017 06:02 PM, Andrew Jeffery wrote:
>>> On Thu, 2017-09-07 at 17:27 -0700, Guenter Roeck wrote:
>>>> On 09/07/2017 08:22 AM, Andrew Jeffery wrote:
>>>>> On Thu, 2017-09-07 at 06:40 -0700, Guenter Roeck wrote:
>>>>>> On 09/06/2017 04:32 PM, Andrew Jeffery wrote:
>>>>>>
>>>>>>>>      
>>>>>>>> Guess I need to dig up my eval board and see if I can reproduce the problem.
>>>>>>>> Seems you are saying that the problem is always seen when issuing a sequence
>>>>>>>> of "clear faults" commands on multiple pages ?
>>>>>>>
>>>>>>> Yeah. We're also seeing bad behaviour under other command sequences as well,
>>>>>>> which lead to this hack of a work-around patch[1].
>>>>>>>
>>>>>>> I'd be very interested in the results of testing against the eval board. I
>>>>>>> don't have access to one and it seems Maxim have discontinued them.
>>>>>>>
>>>>>>
>>>>>> Do you have a somewhat reliable means to reproduce the problem ?
>>>>>
>>>>> It seems we hit a bunch of problems by just continually
>>>>> binding/unbinding the driver, if you don't apply that hacky oneshot
>>>>> retry patch. We can hit problems (in our design?) with something like:
>>>>>
>>>>> # cd /sys/bus/i2c/drivers/max31785; \
>>>>> 	echo $addr > unbind; \
>>>>> 	while echo $addr > bind; \
>>>>> 	do echo $addr > unbind; echo -n .; done;
>>>>>
>>>>> It should hit issues covered by this patch, as the register checks are
>>>>> used in the operations used by probe.
>>>>>
>>>>
>>>> Hmm ... I didn't use your driver but my prototype driver which also supports
>>>> temperature and voltage attributes, so if anything it should create more
>>>> stress on the chip.
>>>
>>> I did add the temp and voltage attributes...
>>>
>>> Any chance you can give mine a try? I don't know what I would have done
>>> to invoke this kind of behaviour, so it would be useful to know whether
>>> or not it happens with one driver but not the other.
>>>
>>
>> Will do.
> 
> Thanks. For reference, here's a devicetree description:
> 
> https://github.com/openbmc/linux/blob/dev-4.10/arch/arm/boot/dts/aspeed-bmc-opp-witherspoon.dts#L283
> 

I can't test with devicetree. x86 system.

2,100+ iterations with your driver, no failures.

Either it is because my chip is a MAX31785 (not A), or the configuration makes a difference,
or it is your hardware.

I'll try to connect a couple of fans next (so far I did without) and try again.

Guenter
--
To unsubscribe from this list: send the line "unsubscribe linux-hwmon" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jeffery Sept. 8, 2017, 2:51 a.m. UTC | #13
On Thu, 2017-09-07 at 19:17 -0700, Guenter Roeck wrote:
> On 09/07/2017 07:06 PM, Andrew Jeffery wrote:
> > On Thu, 2017-09-07 at 18:26 -0700, Guenter Roeck wrote:
> > > On 09/07/2017 06:02 PM, Andrew Jeffery wrote:
> > > > On Thu, 2017-09-07 at 17:27 -0700, Guenter Roeck wrote:
> > > > > On 09/07/2017 08:22 AM, Andrew Jeffery wrote:
> > > > > > On Thu, 2017-09-07 at 06:40 -0700, Guenter Roeck wrote:
> > > > > > > On 09/06/2017 04:32 PM, Andrew Jeffery wrote:
> > > > > > > 
> > > > > > > > >      
> > > > > > > > > Guess I need to dig up my eval board and see if I can reproduce the problem.
> > > > > > > > > Seems you are saying that the problem is always seen when issuing a sequence
> > > > > > > > > of "clear faults" commands on multiple pages ?
> > > > > > > > 
> > > > > > > > Yeah. We're also seeing bad behaviour under other command sequences as well,
> > > > > > > > which lead to this hack of a work-around patch[1].
> > > > > > > > 
> > > > > > > > I'd be very interested in the results of testing against the eval board. I
> > > > > > > > don't have access to one and it seems Maxim have discontinued them.
> > > > > > > > 
> > > > > > > 
> > > > > > > Do you have a somewhat reliable means to reproduce the problem ?
> > > > > > 
> > > > > > It seems we hit a bunch of problems by just continually
> > > > > > binding/unbinding the driver, if you don't apply that hacky oneshot
> > > > > > retry patch. We can hit problems (in our design?) with something like:
> > > > > > 
> > > > > > # cd /sys/bus/i2c/drivers/max31785; \
> > > > > > 	echo $addr > unbind; \
> > > > > > 	while echo $addr > bind; \
> > > > > > 	do echo $addr > unbind; echo -n .; done;
> > > > > > 
> > > > > > It should hit issues covered by this patch, as the register checks are
> > > > > > used in the operations used by probe.
> > > > > > 
> > > > > 
> > > > > Hmm ... I didn't use your driver but my prototype driver which also supports
> > > > > temperature and voltage attributes, so if anything it should create more
> > > > > stress on the chip.
> > > > 
> > > > I did add the temp and voltage attributes...
> > > > 
> > > > Any chance you can give mine a try? I don't know what I would have done
> > > > to invoke this kind of behaviour, so it would be useful to know whether
> > > > or not it happens with one driver but not the other.
> > > > 
> > > 
> > > Will do.
> > 
> > Thanks. For reference, here's a devicetree description:
> > 
> > https://github.com/openbmc/linux/blob/dev-4.10/arch/arm/boot/dts/aspeed-bmc-opp-witherspoon.dts#L283
> > 
> 
> I can't test with devicetree. x86 system.
> 
> 2,100+ iterations with your driver, no failures.

Great. I really appreciate your testing here, so thanks for your
efforts. I owe you a few drinks if we ever happen to meet.

> 
> Either it is because my chip is a MAX31785 (not A), or the configuration makes a difference,
> or it is your hardware.

Yep. My understanding is the A variant is just a difference of
microcode, but who knows what affect that could have. 

> 
> I'll try to connect a couple of fans next (so far I did without) and try again.

Keep me posted if you do.

Thanks again.

Andrew

> 
> Guenter
Andrew Jeffery Sept. 8, 2017, 3:40 a.m. UTC | #14
On Fri, 2017-09-08 at 12:51 +1000, Andrew Jeffery wrote:
> > I can't test with devicetree. x86 system.
> > 
> > 2,100+ iterations with your driver, no failures.
> 
> Great. I really appreciate your testing here, so thanks for your
> efforts. I owe you a few drinks if we ever happen to meet.

Actually, on that, how did you chop out the devicetree parts? Did you
keep the configuration writes at the end of max31785_of_fan_config()
and max31785_of_tmp_config()? Or did you just not call them? These two
functions cause the bulk of the bus traffic with on probe.

Andrew
Guenter Roeck Sept. 8, 2017, 4:43 a.m. UTC | #15
On 09/07/2017 08:40 PM, Andrew Jeffery wrote:
> On Fri, 2017-09-08 at 12:51 +1000, Andrew Jeffery wrote:
>>> I can't test with devicetree. x86 system.
>>>   
>>> 2,100+ iterations with your driver, no failures.
>>
>> Great. I really appreciate your testing here, so thanks for your
>> efforts. I owe you a few drinks if we ever happen to meet.
> 
> Actually, on that, how did you chop out the devicetree parts? Did you
> keep the configuration writes at the end of max31785_of_fan_config()
> and max31785_of_tmp_config()? Or did you just not call them? These two
> functions cause the bulk of the bus traffic with on probe.
> 

I didn't change to code, just compiled and run it. Guess that means
this part was skipped.

I'll replaced the fan configuration with some hard-coded values.
We'll see if it makes a difference.

Guenter

--
To unsubscribe from this list: send the line "unsubscribe linux-hwmon" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jeffery Sept. 8, 2017, 5:08 a.m. UTC | #16
On Thu, 2017-09-07 at 21:43 -0700, Guenter Roeck wrote:
> On 09/07/2017 08:40 PM, Andrew Jeffery wrote:
> > On Fri, 2017-09-08 at 12:51 +1000, Andrew Jeffery wrote:
> > > > I can't test with devicetree. x86 system.
> > > >   
> > > > 2,100+ iterations with your driver, no failures.
> > > 
> > > Great. I really appreciate your testing here, so thanks for your
> > > efforts. I owe you a few drinks if we ever happen to meet.
> > 
> > Actually, on that, how did you chop out the devicetree parts? Did you
> > keep the configuration writes at the end of max31785_of_fan_config()
> > and max31785_of_tmp_config()? Or did you just not call them? These two
> > functions cause the bulk of the bus traffic with on probe.
> > 
> 
> I didn't change to code, just compiled and run it. Guess that means
> this part was skipped.

Right, yeah looking at it a bit more, dev->of_node will be NULL for
for_each_child_of_node(dev->of_node, child), therefore the loop body
won't execute.

> 
> I'll replaced the fan configuration with some hard-coded values.
> We'll see if it makes a difference.

Sounds good.

Andrew
Guenter Roeck Sept. 8, 2017, 5:14 a.m. UTC | #17
On 09/07/2017 08:40 PM, Andrew Jeffery wrote:
> On Fri, 2017-09-08 at 12:51 +1000, Andrew Jeffery wrote:
>>> I can't test with devicetree. x86 system.
>>>   
>>> 2,100+ iterations with your driver, no failures.
>>
>> Great. I really appreciate your testing here, so thanks for your
>> efforts. I owe you a few drinks if we ever happen to meet.
> 
> Actually, on that, how did you chop out the devicetree parts? Did you
> keep the configuration writes at the end of max31785_of_fan_config()
> and max31785_of_tmp_config()? Or did you just not call them? These two
> functions cause the bulk of the bus traffic with on probe.
> 
fan config hardcoded, four fans connected. Still no failure.

Guenter

--
To unsubscribe from this list: send the line "unsubscribe linux-hwmon" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Jeffery Sept. 8, 2017, 5:16 a.m. UTC | #18
On Thu, 2017-09-07 at 22:14 -0700, Guenter Roeck wrote:
> On 09/07/2017 08:40 PM, Andrew Jeffery wrote:
> > On Fri, 2017-09-08 at 12:51 +1000, Andrew Jeffery wrote:
> > > > I can't test with devicetree. x86 system.
> > > >   
> > > > 2,100+ iterations with your driver, no failures.
> > > 
> > > Great. I really appreciate your testing here, so thanks for your
> > > efforts. I owe you a few drinks if we ever happen to meet.
> > 
> > Actually, on that, how did you chop out the devicetree parts? Did you
> > keep the configuration writes at the end of max31785_of_fan_config()
> > and max31785_of_tmp_config()? Or did you just not call them? These two
> > functions cause the bulk of the bus traffic with on probe.
> > 
> 
> fan config hardcoded, four fans connected. Still no failure.

Great. Thanks again for your efforts here.

Andrew
diff mbox

Patch

diff --git a/Documentation/hwmon/pmbus-core b/Documentation/hwmon/pmbus-core
index 8ed10e9ddfb5..3e9f41bb756f 100644
--- a/Documentation/hwmon/pmbus-core
+++ b/Documentation/hwmon/pmbus-core
@@ -218,17 +218,17 @@  Specifically, it provides the following information.
   This function calls the device specific write_byte function if defined.
   Therefore, it must _not_ be called from that function.
 
-  bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
+  int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
 
-  Check if byte register exists. Return true if the register exists, false
-  otherwise.
+  Check if byte register exists. Returns 1 if the register exists, 0 if it does
+  not, and less than zero on an unexpected error.
   This function calls the device specific write_byte function if defined to
   obtain the chip status. Therefore, it must _not_ be called from that function.
 
-  bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
+  int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
 
-  Check if word register exists. Return true if the register exists, false
-  otherwise.
+  Check if word register exists. Returns 1 if the register exists, 0 if it does
+  not, and less than zero on an unexpected error.
   This function calls the device specific write_byte function if defined to
   obtain the chip status. Therefore, it must _not_ be called from that function.
 
diff --git a/drivers/hwmon/pmbus/pmbus.h b/drivers/hwmon/pmbus/pmbus.h
index bfcb13bae34b..c53032a04a6f 100644
--- a/drivers/hwmon/pmbus/pmbus.h
+++ b/drivers/hwmon/pmbus/pmbus.h
@@ -413,9 +413,9 @@  int pmbus_write_byte_data(struct i2c_client *client, int page, u8 reg,
 			  u8 value);
 int pmbus_update_byte_data(struct i2c_client *client, int page, u8 reg,
 			   u8 mask, u8 value);
-void pmbus_clear_faults(struct i2c_client *client);
-bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
-bool pmbus_check_word_register(struct i2c_client *client, int page, int reg);
+int pmbus_clear_faults(struct i2c_client *client);
+int pmbus_check_byte_register(struct i2c_client *client, int page, int reg);
+int pmbus_check_word_register(struct i2c_client *client, int page, int reg);
 int pmbus_do_probe(struct i2c_client *client, const struct i2c_device_id *id,
 		   struct pmbus_driver_info *info);
 int pmbus_do_remove(struct i2c_client *client);
diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index f1eff6b6c798..153700e35431 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -304,18 +304,24 @@  static int _pmbus_read_byte_data(struct i2c_client *client, int page, int reg)
 	return pmbus_read_byte_data(client, page, reg);
 }
 
-static void pmbus_clear_fault_page(struct i2c_client *client, int page)
+static int pmbus_clear_fault_page(struct i2c_client *client, int page)
 {
-	_pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
+	return _pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
 }
 
-void pmbus_clear_faults(struct i2c_client *client)
+int pmbus_clear_faults(struct i2c_client *client)
 {
 	struct pmbus_data *data = i2c_get_clientdata(client);
+	int rv;
 	int i;
 
-	for (i = 0; i < data->info->pages; i++)
-		pmbus_clear_fault_page(client, i);
+	for (i = 0; i < data->info->pages; i++) {
+		rv = pmbus_clear_fault_page(client, i);
+		if (rv)
+			return rv;
+	}
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pmbus_clear_faults);
 
@@ -333,28 +339,45 @@  static int pmbus_check_status_cml(struct i2c_client *client)
 	return 0;
 }
 
-static bool pmbus_check_register(struct i2c_client *client,
+static int pmbus_check_register(struct i2c_client *client,
 				 int (*func)(struct i2c_client *client,
 					     int page, int reg),
 				 int page, int reg)
 {
+	struct pmbus_data *data;
+	int check;
 	int rv;
-	struct pmbus_data *data = i2c_get_clientdata(client);
 
-	rv = func(client, page, reg);
-	if (rv >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
-		rv = pmbus_check_status_cml(client);
-	pmbus_clear_fault_page(client, -1);
-	return rv >= 0;
+	data = i2c_get_clientdata(client);
+
+	/*
+	 * pmbus_set_page() guards transactions on the requested page matching
+	 * the current page. This may be done in the execution of func(), but
+	 * at that point a set-page error is conflated with accessing a
+	 * non-existent register.
+	 */
+	rv = pmbus_set_page(client, page);
+	if (rv < 0)
+		return rv;
+
+	check = func(client, page, reg);
+	if (check >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
+		check = pmbus_check_status_cml(client);
+
+	rv = pmbus_clear_fault_page(client, -1);
+	if (rv < 0)
+		return rv;
+
+	return check >= 0;
 }
 
-bool pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
+int pmbus_check_byte_register(struct i2c_client *client, int page, int reg)
 {
 	return pmbus_check_register(client, _pmbus_read_byte_data, page, reg);
 }
 EXPORT_SYMBOL_GPL(pmbus_check_byte_register);
 
-bool pmbus_check_word_register(struct i2c_client *client, int page, int reg)
+int pmbus_check_word_register(struct i2c_client *client, int page, int reg)
 {
 	return pmbus_check_register(client, _pmbus_read_word_data, page, reg);
 }
@@ -390,7 +413,7 @@  static struct pmbus_data *pmbus_update_device(struct device *dev)
 
 	mutex_lock(&data->update_lock);
 	if (time_after(jiffies, data->last_updated + HZ) || !data->valid) {
-		int i, j;
+		int i, j, ret;
 
 		for (i = 0; i < info->pages; i++) {
 			data->status[PB_STATUS_BASE + i]
@@ -424,7 +447,13 @@  static struct pmbus_data *pmbus_update_device(struct device *dev)
 							    sensor->page,
 							    sensor->reg);
 		}
-		pmbus_clear_faults(client);
+
+		ret = pmbus_clear_faults(client);
+		if (ret < 0) {
+			mutex_unlock(&data->update_lock);
+			return ERR_PTR(ret);
+		}
+
 		data->last_updated = jiffies;
 		data->valid = 1;
 	}
@@ -754,6 +783,9 @@  static ssize_t pmbus_show_boolean(struct device *dev,
 	struct pmbus_data *data = pmbus_update_device(dev);
 	int val;
 
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
 	val = pmbus_get_boolean(data, boolean, attr->index);
 	if (val < 0)
 		return val;
@@ -766,6 +798,9 @@  static ssize_t pmbus_show_sensor(struct device *dev,
 	struct pmbus_data *data = pmbus_update_device(dev);
 	struct pmbus_sensor *sensor = to_pmbus_sensor(devattr);
 
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
 	if (sensor->data < 0)
 		return sensor->data;
 
@@ -995,7 +1030,11 @@  static int pmbus_add_limit_attrs(struct i2c_client *client,
 	struct pmbus_sensor *curr;
 
 	for (i = 0; i < nlimit; i++) {
-		if (pmbus_check_word_register(client, page, l->reg)) {
+		ret = pmbus_check_word_register(client, page, l->reg);
+		if (ret < 0)
+			return ret;
+
+		if (ret) {
 			curr = pmbus_add_sensor(data, name, l->attr, index,
 						page, l->reg, attr->class,
 						attr->update || l->update,
@@ -1041,6 +1080,8 @@  static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
 	if (!base)
 		return -ENOMEM;
 	if (attr->sfunc) {
+		int check;
+
 		ret = pmbus_add_limit_attrs(client, data, info, name,
 					    index, page, base, attr);
 		if (ret < 0)
@@ -1050,9 +1091,13 @@  static int pmbus_add_sensor_attrs_one(struct i2c_client *client,
 		 * alarm attributes, if there is a global alarm bit, and if
 		 * the generic status register for this page is accessible.
 		 */
-		if (!ret && attr->gbit &&
-		    pmbus_check_byte_register(client, page,
-					      data->status_register)) {
+
+		check = pmbus_check_byte_register(client, page,
+						  data->status_register);
+		if (check < 0)
+			return check;
+
+		if (!ret && attr->gbit && check) {
 			ret = pmbus_add_boolean(data, name, "alarm", index,
 						NULL, NULL,
 						PB_STATUS_BASE + page,
@@ -1604,8 +1649,12 @@  static int pmbus_add_fan_attributes(struct i2c_client *client,
 			if (!(info->func[page] & pmbus_fan_flags[f]))
 				break;
 
-			if (!pmbus_check_word_register(client, page,
-						       pmbus_fan_registers[f]))
+			ret = pmbus_check_word_register(client, page,
+						       pmbus_fan_registers[f]);
+			if (ret < 0)
+				return ret;
+
+			if (!ret)
 				break;
 
 			/*
@@ -1628,9 +1677,13 @@  static int pmbus_add_fan_attributes(struct i2c_client *client,
 			 * Each fan status register covers multiple fans,
 			 * so we have to do some magic.
 			 */
-			if ((info->func[page] & pmbus_fan_status_flags[f]) &&
-			    pmbus_check_byte_register(client,
-					page, pmbus_fan_status_registers[f])) {
+			ret =  pmbus_check_byte_register(client, page,
+						pmbus_fan_status_registers[f]);
+			if (ret < 0)
+				return ret;
+
+			if ((info->func[page] & pmbus_fan_status_flags[f])
+					&& ret) {
 				int base;
 
 				if (f > 1)	/* fan 3, 4 */
@@ -1696,10 +1749,13 @@  static int pmbus_identify_common(struct i2c_client *client,
 				 struct pmbus_data *data, int page)
 {
 	int vout_mode = -1;
+	int rv;
 
-	if (pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE))
+	rv = pmbus_check_byte_register(client, page, PMBUS_VOUT_MODE);
+	if (rv == 1)
 		vout_mode = _pmbus_read_byte_data(client, page,
 						  PMBUS_VOUT_MODE);
+
 	if (vout_mode >= 0 && vout_mode != 0xff) {
 		/*
 		 * Not all chips support the VOUT_MODE command,
@@ -1725,8 +1781,7 @@  static int pmbus_identify_common(struct i2c_client *client,
 		}
 	}
 
-	pmbus_clear_fault_page(client, page);
-	return 0;
+	return pmbus_clear_fault_page(client, page);
 }
 
 static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
@@ -1756,7 +1811,9 @@  static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
 	if (ret >= 0 && (ret & PB_CAPABILITY_ERROR_CHECK))
 		client->flags |= I2C_CLIENT_PEC;
 
-	pmbus_clear_faults(client);
+	ret = pmbus_clear_faults(client);
+	if (ret < 0)
+		return ret;
 
 	if (info->identify) {
 		ret = (*info->identify)(client, info);