diff mbox

[v4,4/4] thermal: Add Tegra SOCTHERM thermal management driver

Message ID 1408100430-29461-1-git-send-email-mperttunen@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: Eduardo Valentin
Headers show

Commit Message

Mikko Perttunen Aug. 15, 2014, 11 a.m. UTC
This adds support for the Tegra SOCTHERM thermal sensing and management
system found in the Tegra124 system-on-chip. This initial driver supports
temperature polling for four thermal zones.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
---
v4:
- constified struct
- added bunch of defines for masks and shifts
- added comment to translate_temp
- changed translate_temp to take u16

 drivers/thermal/Kconfig          |  10 +
 drivers/thermal/Makefile         |   1 +
 drivers/thermal/tegra_soctherm.c | 458 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 469 insertions(+)
 create mode 100644 drivers/thermal/tegra_soctherm.c

Comments

Juha-Matti Tilli Aug. 19, 2014, 2:09 p.m. UTC | #1
> This adds support for the Tegra SOCTHERM thermal sensing and management
> system found in the Tegra124 system-on-chip. This initial driver supports
> temperature polling for four thermal zones.

I have several comments about this patch. Overall, the code looks clean,
way cleaner than NVIDIA's internal soc_therm driver. I adopted your code
to run on the internal firmware of a Tegra124 SoC. Additionally, I
tested the code as-is on a Jetson TK1.

My test shows that the temperature readings look sane and do vary with
load, so the code seems to work. However, I have some concerns about the
calibration values calculated by this code and the error handling of the
code.

Originally, I thought the fuse offsets were incorrect but as it turns
out, the official Linux kernel starts counting the fuses at a different
location than NVIDIA's internal kernel.

[snip]
> +static int calculate_shared_calibration(struct tsensor_shared_calibration *r)
> +{
> +	u32 val;
> +	u32 shifted_cp, shifted_ft;
> +	int err;
> +
> +	err = tegra_fuse_readl(FUSE_TSENSOR8_CALIB, &val);
> +	if (err)
> +		return err;
> +	r->base_cp = val & FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK;
> +	r->base_ft = (val & FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK)
> +		>> FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT;
> +
> +	err = tegra_fuse_readl(FUSE_SPARE_REALIGNMENT_REG_0, &val);
> +	if (err)
> +		return err;
> +	shifted_cp = sign_extend32(val, 5);
> +	val = ((val & FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK)
> +		>> FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT);
> +	shifted_ft = sign_extend32(val, 4);
> +
> +	r->actual_temp_cp = 2 * NOMINAL_CALIB_CP_T124 + shifted_cp;
> +	r->actual_temp_ft = 2 * NOMINAL_CALIB_FT_T124 + shifted_ft;
> +
> +	return 0;
> +}
> +
> +static int calculate_tsensor_calibration(
> +	struct tegra_tsensor *sensor,
> +	struct tsensor_shared_calibration shared,
> +	u32 *calib
> +)
> +{
> +	u32 val;
> +	s32 actual_tsensor_ft, actual_tsensor_cp;
> +	s32 delta_sens, delta_temp;
> +	s32 mult, div;
> +	s16 therma, thermb;
> +	int err;
> +
> +	err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
> +	if (err)
> +		return err;
> +
> +	actual_tsensor_cp = (shared.base_cp * 64) + sign_extend32(val, 12);
> +	val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK)
> +		>> FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
> +	actual_tsensor_ft = (shared.base_ft * 32) + sign_extend32(val, 12);
> +
> +	delta_sens = actual_tsensor_ft - actual_tsensor_cp;
> +	delta_temp = shared.actual_temp_ft - shared.actual_temp_cp;
> +
> +	mult = t124_tsensor_config.pdiv * t124_tsensor_config.tsample_ate;
> +	div = t124_tsensor_config.tsample * t124_tsensor_config.pdiv_ate;
> +
> +	therma = div_s64((s64) delta_temp * (1LL << 13) * mult,
> +		(s64) delta_sens * div);
> +	thermb = div_s64(((s64) actual_tsensor_ft * shared.actual_temp_cp) -
> +		((s64) actual_tsensor_cp * shared.actual_temp_ft),
> +		(s64) delta_sens);
> +
> +	therma = div_s64((s64) therma * sensor->fuse_corr_alpha,
> +		(s64) 1000000LL);
> +	thermb = div_s64((s64) thermb * sensor->fuse_corr_alpha +
> +		sensor->fuse_corr_beta,
> +		(s64) 1000000LL);
> +
> +	*calib = ((u16)(therma) << SENSOR_CONFIG2_THERMA_SHIFT) |
> +		((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
> +
> +	return 0;
> +}
> +
> +static int enable_tsensor(struct tegra_soctherm *tegra,
> +			  struct tegra_tsensor *sensor,
> +			  struct tsensor_shared_calibration shared)
> +{
> +	void * __iomem base = tegra->regs + sensor->base;
> +	unsigned int val;
> +	u32 calib;
> +	int err;
> +
> +	err = calculate_tsensor_calibration(sensor, shared, &calib);
> +	if (err)
> +		return err;
> +
> +	val = 0;
> +	val |= t124_tsensor_config.tall << SENSOR_CONFIG0_TALL_SHIFT;
> +	writel(val, base + SENSOR_CONFIG0);
> +
> +	val = 0;
> +	val |= (t124_tsensor_config.tsample - 1) <<
> +		SENSOR_CONFIG1_TSAMPLE_SHIFT;
> +	val |= t124_tsensor_config.tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
> +	val |= t124_tsensor_config.ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
> +	val |= SENSOR_CONFIG1_TEMP_ENABLE;
> +	writel(val, base + SENSOR_CONFIG1);
> +
> +	writel(calib, base + SENSOR_CONFIG2);
> +
> +	return 0;
> +}

This code writes different values to SENSOR_CONFIG2 registers than what
the NVIDIA's internal soc_therm driver does. Because the calibration
value calculation should be based on the same fuses that NVIDIA's
internal driver reads, I believe the value calculated and eventually
written to SENSOR_CONFIG2 should be identical with NVIDIA's internal
driver. That is not the case now.

I first loaded the NVIDIA's internal soc_therm driver and then loaded
code adopted from your driver running on Tegra's firmware. Here's a log
of how the CONFIG2 values are changed by this code to different values
than NVIDIA's internal soc_therm driver originally sets them to:

CONFIG2: 5b0f813 -> 5c6f7fb
CONFIG2: 5e8f7cd -> 5fff7b4
CONFIG2: 5bdf7ce -> 5d3f7b5
CONFIG2: 596f831 -> 5aaf81a
CONFIG2: 675f6b5 -> 68cf698
CONFIG2: 6d6f641 -> 6eff623
CONFIG2: 641f72b -> 659f710
CONFIG2: 590f861 -> 5a5f84a

On the left, there's the value set by NVIDIA's internal soc_therm driver
and on the right, there's the value set by code adopted from your
driver. My modifications to the code are minor, and I don't think they
could explain why the CONFIG2 values set are different.

If you want them, I can supply you the values of the fuses on my
development board.

The temperature readings look sane and do vary with load, but I think
they're a bit different than what NVIDIA's internal soc_therm driver
gives. I'm not entirely sure if the calibration values set by your
driver or the calibration values set by NVIDIA's internal soc_therm
driver are correct, but it seems to me that only one of these drivers
can be correct.

The values written to CONFIG1 and CONFIG0 do seem sane.

Since there are divisions being done, some sort of explicit protection
from divisions by zero should perhaps be considered, although this can
happen only if the fuses for some reason read all zeroes. I'm not sure
if that's possible with real hardware.

Where does this code come from? It does not definitely come from
NVIDIA's internal soc_therm driver, as it looks entirely different. And
it calculates different calibration values. If you wrote the code, you
should consider commenting it since there are a lot of complex
calculations going on that are not obvious to the reader. For example,
what do "cp" and "ft" mean? Are they acronyms? If so, they should be
explained.

[snip]
> +	for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
> +		struct tegra_thermctl_zone *zone =
> +			devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
> +		if (!zone) {
> +			err = -ENOMEM;

Shouldn't this one have a --i line like the next IS_ERR block?

> +			goto unregister_tzs;
> +		}
> +
> +		zone->temp_reg = tegra->regs + thermctl_temp_offsets[i];
> +		zone->temp_shift = thermctl_temp_shifts[i];
> +
> +		tz = thermal_zone_of_sensor_register(
> +			&pdev->dev, i, zone, tegra_thermctl_get_temp, NULL);
> +		if (IS_ERR(tz)) {
> +			err = PTR_ERR(tz);
> +			dev_err(&pdev->dev, "failed to register sensor: %d\n",
> +				err);
> +			--i;
> +			goto unregister_tzs;
> +		}
> +
> +		tegra->thermctl_tzs[i] = tz;
> +	}
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mikko Perttunen Aug. 19, 2014, 2:31 p.m. UTC | #2
On 19/08/14 17:09, Juha-Matti Tilli wrote:
>> This adds support for the Tegra SOCTHERM thermal sensing and management
>> system found in the Tegra124 system-on-chip. This initial driver supports
>> temperature polling for four thermal zones.
>
> I have several comments about this patch. Overall, the code looks clean,
> way cleaner than NVIDIA's internal soc_therm driver. I adopted your code
> to run on the internal firmware of a Tegra124 SoC. Additionally, I
> tested the code as-is on a Jetson TK1.
>
> My test shows that the temperature readings look sane and do vary with
> load, so the code seems to work. However, I have some concerns about the
> calibration values calculated by this code and the error handling of the
> code.
>
> Originally, I thought the fuse offsets were incorrect but as it turns
> out, the official Linux kernel starts counting the fuses at a different
> location than NVIDIA's internal kernel.
>
> [snip]
>> +static int calculate_shared_calibration(struct tsensor_shared_calibration *r)
>> +{
>> +	u32 val;
>> +	u32 shifted_cp, shifted_ft;
>> +	int err;
>> +
>> +	err = tegra_fuse_readl(FUSE_TSENSOR8_CALIB, &val);
>> +	if (err)
>> +		return err;
>> +	r->base_cp = val & FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK;
>> +	r->base_ft = (val & FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK)
>> +		>> FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT;
>> +
>> +	err = tegra_fuse_readl(FUSE_SPARE_REALIGNMENT_REG_0, &val);
>> +	if (err)
>> +		return err;
>> +	shifted_cp = sign_extend32(val, 5);
>> +	val = ((val & FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK)
>> +		>> FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT);
>> +	shifted_ft = sign_extend32(val, 4);
>> +
>> +	r->actual_temp_cp = 2 * NOMINAL_CALIB_CP_T124 + shifted_cp;
>> +	r->actual_temp_ft = 2 * NOMINAL_CALIB_FT_T124 + shifted_ft;
>> +
>> +	return 0;
>> +}
>> +
>> +static int calculate_tsensor_calibration(
>> +	struct tegra_tsensor *sensor,
>> +	struct tsensor_shared_calibration shared,
>> +	u32 *calib
>> +)
>> +{
>> +	u32 val;
>> +	s32 actual_tsensor_ft, actual_tsensor_cp;
>> +	s32 delta_sens, delta_temp;
>> +	s32 mult, div;
>> +	s16 therma, thermb;
>> +	int err;
>> +
>> +	err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
>> +	if (err)
>> +		return err;
>> +
>> +	actual_tsensor_cp = (shared.base_cp * 64) + sign_extend32(val, 12);
>> +	val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK)
>> +		>> FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
>> +	actual_tsensor_ft = (shared.base_ft * 32) + sign_extend32(val, 12);
>> +
>> +	delta_sens = actual_tsensor_ft - actual_tsensor_cp;
>> +	delta_temp = shared.actual_temp_ft - shared.actual_temp_cp;
>> +
>> +	mult = t124_tsensor_config.pdiv * t124_tsensor_config.tsample_ate;
>> +	div = t124_tsensor_config.tsample * t124_tsensor_config.pdiv_ate;
>> +
>> +	therma = div_s64((s64) delta_temp * (1LL << 13) * mult,
>> +		(s64) delta_sens * div);
>> +	thermb = div_s64(((s64) actual_tsensor_ft * shared.actual_temp_cp) -
>> +		((s64) actual_tsensor_cp * shared.actual_temp_ft),
>> +		(s64) delta_sens);
>> +
>> +	therma = div_s64((s64) therma * sensor->fuse_corr_alpha,
>> +		(s64) 1000000LL);
>> +	thermb = div_s64((s64) thermb * sensor->fuse_corr_alpha +
>> +		sensor->fuse_corr_beta,
>> +		(s64) 1000000LL);
>> +
>> +	*calib = ((u16)(therma) << SENSOR_CONFIG2_THERMA_SHIFT) |
>> +		((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
>> +
>> +	return 0;
>> +}
>> +
>> +static int enable_tsensor(struct tegra_soctherm *tegra,
>> +			  struct tegra_tsensor *sensor,
>> +			  struct tsensor_shared_calibration shared)
>> +{
>> +	void * __iomem base = tegra->regs + sensor->base;
>> +	unsigned int val;
>> +	u32 calib;
>> +	int err;
>> +
>> +	err = calculate_tsensor_calibration(sensor, shared, &calib);
>> +	if (err)
>> +		return err;
>> +
>> +	val = 0;
>> +	val |= t124_tsensor_config.tall << SENSOR_CONFIG0_TALL_SHIFT;
>> +	writel(val, base + SENSOR_CONFIG0);
>> +
>> +	val = 0;
>> +	val |= (t124_tsensor_config.tsample - 1) <<
>> +		SENSOR_CONFIG1_TSAMPLE_SHIFT;
>> +	val |= t124_tsensor_config.tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
>> +	val |= t124_tsensor_config.ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
>> +	val |= SENSOR_CONFIG1_TEMP_ENABLE;
>> +	writel(val, base + SENSOR_CONFIG1);
>> +
>> +	writel(calib, base + SENSOR_CONFIG2);
>> +
>> +	return 0;
>> +}
>
> This code writes different values to SENSOR_CONFIG2 registers than what
> the NVIDIA's internal soc_therm driver does. Because the calibration
> value calculation should be based on the same fuses that NVIDIA's
> internal driver reads, I believe the value calculated and eventually
> written to SENSOR_CONFIG2 should be identical with NVIDIA's internal
> driver. That is not the case now.
>
> I first loaded the NVIDIA's internal soc_therm driver and then loaded
> code adopted from your driver running on Tegra's firmware. Here's a log
> of how the CONFIG2 values are changed by this code to different values
> than NVIDIA's internal soc_therm driver originally sets them to:
>
> CONFIG2: 5b0f813 -> 5c6f7fb
> CONFIG2: 5e8f7cd -> 5fff7b4
> CONFIG2: 5bdf7ce -> 5d3f7b5
> CONFIG2: 596f831 -> 5aaf81a
> CONFIG2: 675f6b5 -> 68cf698
> CONFIG2: 6d6f641 -> 6eff623
> CONFIG2: 641f72b -> 659f710
> CONFIG2: 590f861 -> 5a5f84a
>
> On the left, there's the value set by NVIDIA's internal soc_therm driver
> and on the right, there's the value set by code adopted from your
> driver. My modifications to the code are minor, and I don't think they
> could explain why the CONFIG2 values set are different.

The reason is that the downstream driver uses a function called 
div64_s64_precise to calculate the values. Apparently the results will 
be more accurate, but in my (admittedly brief) testing, the difference 
was somewhat negligible, so I opted for a simpler implementation. The 
precision of the sensors is not very high anyway (only 0.5C).

>
> If you want them, I can supply you the values of the fuses on my
> development board.
>
> The temperature readings look sane and do vary with load, but I think
> they're a bit different than what NVIDIA's internal soc_therm driver
> gives. I'm not entirely sure if the calibration values set by your
> driver or the calibration values set by NVIDIA's internal soc_therm
> driver are correct, but it seems to me that only one of these drivers
> can be correct.

How much do the readings differ in your tests?

>
> The values written to CONFIG1 and CONFIG0 do seem sane.
>
> Since there are divisions being done, some sort of explicit protection
> from divisions by zero should perhaps be considered, although this can
> happen only if the fuses for some reason read all zeroes. I'm not sure
> if that's possible with real hardware.

Even the earliest hardware should have something fused, so pretty much 
impossible, I'd think. Still, I guess I can throw in a check.

>
> Where does this code come from? It does not definitely come from
> NVIDIA's internal soc_therm driver, as it looks entirely different. And
> it calculates different calibration values. If you wrote the code, you
> should consider commenting it since there are a lot of complex
> calculations going on that are not obvious to the reader. For example,
> what do "cp" and "ft" mean? Are they acronyms? If so, they should be
> explained.

The calculation code does come from the downstream kernel; in the 
downstream kernel it's just split between multiple files. At least the 
soctherm driver and the tegra124 fuse code. I have simplified it quite a 
bit when porting over. As for the complex calculations or CP and FT, I 
don't really have a good picture of what they are doing, either.

>
> [snip]
>> +	for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
>> +		struct tegra_thermctl_zone *zone =
>> +			devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
>> +		if (!zone) {
>> +			err = -ENOMEM;
>
> Shouldn't this one have a --i line like the next IS_ERR block?

Well spotted!

>
>> +			goto unregister_tzs;
>> +		}
>> +
>> +		zone->temp_reg = tegra->regs + thermctl_temp_offsets[i];
>> +		zone->temp_shift = thermctl_temp_shifts[i];
>> +
>> +		tz = thermal_zone_of_sensor_register(
>> +			&pdev->dev, i, zone, tegra_thermctl_get_temp, NULL);
>> +		if (IS_ERR(tz)) {
>> +			err = PTR_ERR(tz);
>> +			dev_err(&pdev->dev, "failed to register sensor: %d\n",
>> +				err);
>> +			--i;
>> +			goto unregister_tzs;
>> +		}
>> +
>> +		tegra->thermctl_tzs[i] = tz;
>> +	}

Thanks,
Mikko
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eduardo Valentin Aug. 19, 2014, 2:33 p.m. UTC | #3
Juha-Matti, moro,

On Tue, Aug 19, 2014 at 10:09 AM, Juha-Matti Tilli
<juha-matti.tilli@iki.fi> wrote:
>> This adds support for the Tegra SOCTHERM thermal sensing and management
>> system found in the Tegra124 system-on-chip. This initial driver supports
>> temperature polling for four thermal zones.
>
> I have several comments about this patch. Overall, the code looks clean,
> way cleaner than NVIDIA's internal soc_therm driver. I adopted your code
> to run on the internal firmware of a Tegra124 SoC. Additionally, I
> tested the code as-is on a Jetson TK1.

Thanks for the testing effort!

>
> My test shows that the temperature readings look sane and do vary with
> load, so the code seems to work. However, I have some concerns about the
> calibration values calculated by this code and the error handling of the
> code.
>
> Originally, I thought the fuse offsets were incorrect but as it turns
> out, the official Linux kernel starts counting the fuses at a different
> location than NVIDIA's internal kernel.
>

This is a major concern. Juha-Matti and Mikko, can you please cross
check if this driver is accessing to the correct fuse register
location?

> [snip]
>> +static int calculate_shared_calibration(struct tsensor_shared_calibration *r)
>> +{
>> +     u32 val;
>> +     u32 shifted_cp, shifted_ft;
>> +     int err;
>> +
>> +     err = tegra_fuse_readl(FUSE_TSENSOR8_CALIB, &val);
>> +     if (err)
>> +             return err;
>> +     r->base_cp = val & FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK;
>> +     r->base_ft = (val & FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK)
>> +             >> FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT;
>> +
>> +     err = tegra_fuse_readl(FUSE_SPARE_REALIGNMENT_REG_0, &val);
>> +     if (err)
>> +             return err;
>> +     shifted_cp = sign_extend32(val, 5);
>> +     val = ((val & FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK)
>> +             >> FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT);
>> +     shifted_ft = sign_extend32(val, 4);
>> +
>> +     r->actual_temp_cp = 2 * NOMINAL_CALIB_CP_T124 + shifted_cp;
>> +     r->actual_temp_ft = 2 * NOMINAL_CALIB_FT_T124 + shifted_ft;
>> +
>> +     return 0;
>> +}
>> +
>> +static int calculate_tsensor_calibration(
>> +     struct tegra_tsensor *sensor,
>> +     struct tsensor_shared_calibration shared,
>> +     u32 *calib
>> +)
>> +{
>> +     u32 val;
>> +     s32 actual_tsensor_ft, actual_tsensor_cp;
>> +     s32 delta_sens, delta_temp;
>> +     s32 mult, div;
>> +     s16 therma, thermb;
>> +     int err;
>> +
>> +     err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
>> +     if (err)
>> +             return err;
>> +
>> +     actual_tsensor_cp = (shared.base_cp * 64) + sign_extend32(val, 12);
>> +     val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK)
>> +             >> FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
>> +     actual_tsensor_ft = (shared.base_ft * 32) + sign_extend32(val, 12);
>> +
>> +     delta_sens = actual_tsensor_ft - actual_tsensor_cp;
>> +     delta_temp = shared.actual_temp_ft - shared.actual_temp_cp;
>> +
>> +     mult = t124_tsensor_config.pdiv * t124_tsensor_config.tsample_ate;
>> +     div = t124_tsensor_config.tsample * t124_tsensor_config.pdiv_ate;
>> +
>> +     therma = div_s64((s64) delta_temp * (1LL << 13) * mult,
>> +             (s64) delta_sens * div);
>> +     thermb = div_s64(((s64) actual_tsensor_ft * shared.actual_temp_cp) -
>> +             ((s64) actual_tsensor_cp * shared.actual_temp_ft),
>> +             (s64) delta_sens);
>> +
>> +     therma = div_s64((s64) therma * sensor->fuse_corr_alpha,
>> +             (s64) 1000000LL);
>> +     thermb = div_s64((s64) thermb * sensor->fuse_corr_alpha +
>> +             sensor->fuse_corr_beta,
>> +             (s64) 1000000LL);
>> +
>> +     *calib = ((u16)(therma) << SENSOR_CONFIG2_THERMA_SHIFT) |
>> +             ((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
>> +
>> +     return 0;
>> +}
>> +
>> +static int enable_tsensor(struct tegra_soctherm *tegra,
>> +                       struct tegra_tsensor *sensor,
>> +                       struct tsensor_shared_calibration shared)
>> +{
>> +     void * __iomem base = tegra->regs + sensor->base;
>> +     unsigned int val;
>> +     u32 calib;
>> +     int err;
>> +
>> +     err = calculate_tsensor_calibration(sensor, shared, &calib);
>> +     if (err)
>> +             return err;
>> +
>> +     val = 0;
>> +     val |= t124_tsensor_config.tall << SENSOR_CONFIG0_TALL_SHIFT;
>> +     writel(val, base + SENSOR_CONFIG0);
>> +
>> +     val = 0;
>> +     val |= (t124_tsensor_config.tsample - 1) <<
>> +             SENSOR_CONFIG1_TSAMPLE_SHIFT;
>> +     val |= t124_tsensor_config.tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
>> +     val |= t124_tsensor_config.ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
>> +     val |= SENSOR_CONFIG1_TEMP_ENABLE;
>> +     writel(val, base + SENSOR_CONFIG1);
>> +
>> +     writel(calib, base + SENSOR_CONFIG2);
>> +
>> +     return 0;
>> +}
>
> This code writes different values to SENSOR_CONFIG2 registers than what
> the NVIDIA's internal soc_therm driver does. Because the calibration
> value calculation should be based on the same fuses that NVIDIA's
> internal driver reads, I believe the value calculated and eventually
> written to SENSOR_CONFIG2 should be identical with NVIDIA's internal
> driver. That is not the case now.

Well, I would suggest using the hardware documentation as a base here.
I don't have access to the internal driver, thus, it is hard to judge
what is right, what is wrong.

>
> I first loaded the NVIDIA's internal soc_therm driver and then loaded
> code adopted from your driver running on Tegra's firmware. Here's a log
> of how the CONFIG2 values are changed by this code to different values
> than NVIDIA's internal soc_therm driver originally sets them to:
>
> CONFIG2: 5b0f813 -> 5c6f7fb
> CONFIG2: 5e8f7cd -> 5fff7b4
> CONFIG2: 5bdf7ce -> 5d3f7b5
> CONFIG2: 596f831 -> 5aaf81a
> CONFIG2: 675f6b5 -> 68cf698
> CONFIG2: 6d6f641 -> 6eff623
> CONFIG2: 641f72b -> 659f710
> CONFIG2: 590f861 -> 5a5f84a
>
> On the left, there's the value set by NVIDIA's internal soc_therm driver
> and on the right, there's the value set by code adopted from your
> driver. My modifications to the code are minor, and I don't think they
> could explain why the CONFIG2 values set are different.
>
> If you want them, I can supply you the values of the fuses on my
> development board.
>
> The temperature readings look sane and do vary with load, but I think
> they're a bit different than what NVIDIA's internal soc_therm driver

hmm..  Based on a single sample?

> gives. I'm not entirely sure if the calibration values set by your
> driver or the calibration values set by NVIDIA's internal soc_therm
> driver are correct, but it seems to me that only one of these drivers
> can be correct.

The calibration values may have strong influence on high temperatures,
which turns out to be the most critical situation on drivers like
this.

>
> The values written to CONFIG1 and CONFIG0 do seem sane.
>
> Since there are divisions being done, some sort of explicit protection
> from divisions by zero should perhaps be considered, although this can
> happen only if the fuses for some reason read all zeroes. I'm not sure
> if that's possible with real hardware.
>
> Where does this code come from? It does not definitely come from
> NVIDIA's internal soc_therm driver, as it looks entirely different. And
> it calculates different calibration values. If you wrote the code, you
> should consider commenting it since there are a lot of complex
> calculations going on that are not obvious to the reader. For example,
> what do "cp" and "ft" mean? Are they acronyms? If so, they should be
> explained.
>
> [snip]
>> +     for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
>> +             struct tegra_thermctl_zone *zone =
>> +                     devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
>> +             if (!zone) {
>> +                     err = -ENOMEM;
>
> Shouldn't this one have a --i line like the next IS_ERR block?
>
>> +                     goto unregister_tzs;
>> +             }
>> +
>> +             zone->temp_reg = tegra->regs + thermctl_temp_offsets[i];
>> +             zone->temp_shift = thermctl_temp_shifts[i];
>> +
>> +             tz = thermal_zone_of_sensor_register(
>> +                     &pdev->dev, i, zone, tegra_thermctl_get_temp, NULL);
>> +             if (IS_ERR(tz)) {
>> +                     err = PTR_ERR(tz);
>> +                     dev_err(&pdev->dev, "failed to register sensor: %d\n",
>> +                             err);
>> +                     --i;
>> +                     goto unregister_tzs;
>> +             }
>> +
>> +             tegra->thermctl_tzs[i] = tz;
>> +     }
Mikko Perttunen Aug. 19, 2014, 3:27 p.m. UTC | #4
On 08/19/2014 05:33 PM, edubezval@gmail.com wrote:
> Juha-Matti, moro,
>
> On Tue, Aug 19, 2014 at 10:09 AM, Juha-Matti Tilli
> <juha-matti.tilli@iki.fi> wrote:
>>> This adds support for the Tegra SOCTHERM thermal sensing and management
>>> system found in the Tegra124 system-on-chip. This initial driver supports
>>> temperature polling for four thermal zones.
>>
>> I have several comments about this patch. Overall, the code looks clean,
>> way cleaner than NVIDIA's internal soc_therm driver. I adopted your code
>> to run on the internal firmware of a Tegra124 SoC. Additionally, I
>> tested the code as-is on a Jetson TK1.
>
> Thanks for the testing effort!
>
>>
>> My test shows that the temperature readings look sane and do vary with
>> load, so the code seems to work. However, I have some concerns about the
>> calibration values calculated by this code and the error handling of the
>> code.
>>
>> Originally, I thought the fuse offsets were incorrect but as it turns
>> out, the official Linux kernel starts counting the fuses at a different
>> location than NVIDIA's internal kernel.
>>
>
> This is a major concern. Juha-Matti and Mikko, can you please cross
> check if this driver is accessing to the correct fuse register
> location?

It is. I made the same mistake earlier and fixed it.

>
>> [snip]
>>> +static int calculate_shared_calibration(struct tsensor_shared_calibration *r)
>>> +{
>>> +     u32 val;
>>> +     u32 shifted_cp, shifted_ft;
>>> +     int err;
>>> +
>>> +     err = tegra_fuse_readl(FUSE_TSENSOR8_CALIB, &val);
>>> +     if (err)
>>> +             return err;
>>> +     r->base_cp = val & FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK;
>>> +     r->base_ft = (val & FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK)
>>> +             >> FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT;
>>> +
>>> +     err = tegra_fuse_readl(FUSE_SPARE_REALIGNMENT_REG_0, &val);
>>> +     if (err)
>>> +             return err;
>>> +     shifted_cp = sign_extend32(val, 5);
>>> +     val = ((val & FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK)
>>> +             >> FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT);
>>> +     shifted_ft = sign_extend32(val, 4);
>>> +
>>> +     r->actual_temp_cp = 2 * NOMINAL_CALIB_CP_T124 + shifted_cp;
>>> +     r->actual_temp_ft = 2 * NOMINAL_CALIB_FT_T124 + shifted_ft;
>>> +
>>> +     return 0;
>>> +}
>>> +
>>> +static int calculate_tsensor_calibration(
>>> +     struct tegra_tsensor *sensor,
>>> +     struct tsensor_shared_calibration shared,
>>> +     u32 *calib
>>> +)
>>> +{
>>> +     u32 val;
>>> +     s32 actual_tsensor_ft, actual_tsensor_cp;
>>> +     s32 delta_sens, delta_temp;
>>> +     s32 mult, div;
>>> +     s16 therma, thermb;
>>> +     int err;
>>> +
>>> +     err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
>>> +     if (err)
>>> +             return err;
>>> +
>>> +     actual_tsensor_cp = (shared.base_cp * 64) + sign_extend32(val, 12);
>>> +     val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK)
>>> +             >> FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
>>> +     actual_tsensor_ft = (shared.base_ft * 32) + sign_extend32(val, 12);
>>> +
>>> +     delta_sens = actual_tsensor_ft - actual_tsensor_cp;
>>> +     delta_temp = shared.actual_temp_ft - shared.actual_temp_cp;
>>> +
>>> +     mult = t124_tsensor_config.pdiv * t124_tsensor_config.tsample_ate;
>>> +     div = t124_tsensor_config.tsample * t124_tsensor_config.pdiv_ate;
>>> +
>>> +     therma = div_s64((s64) delta_temp * (1LL << 13) * mult,
>>> +             (s64) delta_sens * div);
>>> +     thermb = div_s64(((s64) actual_tsensor_ft * shared.actual_temp_cp) -
>>> +             ((s64) actual_tsensor_cp * shared.actual_temp_ft),
>>> +             (s64) delta_sens);
>>> +
>>> +     therma = div_s64((s64) therma * sensor->fuse_corr_alpha,
>>> +             (s64) 1000000LL);
>>> +     thermb = div_s64((s64) thermb * sensor->fuse_corr_alpha +
>>> +             sensor->fuse_corr_beta,
>>> +             (s64) 1000000LL);
>>> +
>>> +     *calib = ((u16)(therma) << SENSOR_CONFIG2_THERMA_SHIFT) |
>>> +             ((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
>>> +
>>> +     return 0;
>>> +}
>>> +
>>> +static int enable_tsensor(struct tegra_soctherm *tegra,
>>> +                       struct tegra_tsensor *sensor,
>>> +                       struct tsensor_shared_calibration shared)
>>> +{
>>> +     void * __iomem base = tegra->regs + sensor->base;
>>> +     unsigned int val;
>>> +     u32 calib;
>>> +     int err;
>>> +
>>> +     err = calculate_tsensor_calibration(sensor, shared, &calib);
>>> +     if (err)
>>> +             return err;
>>> +
>>> +     val = 0;
>>> +     val |= t124_tsensor_config.tall << SENSOR_CONFIG0_TALL_SHIFT;
>>> +     writel(val, base + SENSOR_CONFIG0);
>>> +
>>> +     val = 0;
>>> +     val |= (t124_tsensor_config.tsample - 1) <<
>>> +             SENSOR_CONFIG1_TSAMPLE_SHIFT;
>>> +     val |= t124_tsensor_config.tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
>>> +     val |= t124_tsensor_config.ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
>>> +     val |= SENSOR_CONFIG1_TEMP_ENABLE;
>>> +     writel(val, base + SENSOR_CONFIG1);
>>> +
>>> +     writel(calib, base + SENSOR_CONFIG2);
>>> +
>>> +     return 0;
>>> +}
>>
>> This code writes different values to SENSOR_CONFIG2 registers than what
>> the NVIDIA's internal soc_therm driver does. Because the calibration
>> value calculation should be based on the same fuses that NVIDIA's
>> internal driver reads, I believe the value calculated and eventually
>> written to SENSOR_CONFIG2 should be identical with NVIDIA's internal
>> driver. That is not the case now.
>
> Well, I would suggest using the hardware documentation as a base here.
> I don't have access to the internal driver, thus, it is hard to judge
> what is right, what is wrong.

The hardware documentation (TRM) doesn't say anything about these 
registers, so I've mostly gone by the downstream driver. FYI, the driver 
is publicly available in the L4T kernel sources at 
https://developer.nvidia.com/sites/default/files/akamai/mobile/files/L4T/kernel_src.tbz2. 
I'm not how useful reading them would be, though.

>
>>
>> I first loaded the NVIDIA's internal soc_therm driver and then loaded
>> code adopted from your driver running on Tegra's firmware. Here's a log
>> of how the CONFIG2 values are changed by this code to different values
>> than NVIDIA's internal soc_therm driver originally sets them to:
>>
>> CONFIG2: 5b0f813 -> 5c6f7fb
>> CONFIG2: 5e8f7cd -> 5fff7b4
>> CONFIG2: 5bdf7ce -> 5d3f7b5
>> CONFIG2: 596f831 -> 5aaf81a
>> CONFIG2: 675f6b5 -> 68cf698
>> CONFIG2: 6d6f641 -> 6eff623
>> CONFIG2: 641f72b -> 659f710
>> CONFIG2: 590f861 -> 5a5f84a
>>
>> On the left, there's the value set by NVIDIA's internal soc_therm driver
>> and on the right, there's the value set by code adopted from your
>> driver. My modifications to the code are minor, and I don't think they
>> could explain why the CONFIG2 values set are different.
>>
>> If you want them, I can supply you the values of the fuses on my
>> development board.
>>
>> The temperature readings look sane and do vary with load, but I think
>> they're a bit different than what NVIDIA's internal soc_therm driver
>
> hmm..  Based on a single sample?
>
>> gives. I'm not entirely sure if the calibration values set by your
>> driver or the calibration values set by NVIDIA's internal soc_therm
>> driver are correct, but it seems to me that only one of these drivers
>> can be correct.
>
> The calibration values may have strong influence on high temperatures,
> which turns out to be the most critical situation on drivers like
> this.

I explained the difference in my other message, but I can add the 
"precise" version of the division if wanted. I'm not sure if the error 
scales with temperature.

>
>>
>> The values written to CONFIG1 and CONFIG0 do seem sane.
>>
>> Since there are divisions being done, some sort of explicit protection
>> from divisions by zero should perhaps be considered, although this can
>> happen only if the fuses for some reason read all zeroes. I'm not sure
>> if that's possible with real hardware.
>>
>> Where does this code come from? It does not definitely come from
>> NVIDIA's internal soc_therm driver, as it looks entirely different. And
>> it calculates different calibration values. If you wrote the code, you
>> should consider commenting it since there are a lot of complex
>> calculations going on that are not obvious to the reader. For example,
>> what do "cp" and "ft" mean? Are they acronyms? If so, they should be
>> explained.
>>
>> [snip]
>>> +     for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
>>> +             struct tegra_thermctl_zone *zone =
>>> +                     devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
>>> +             if (!zone) {
>>> +                     err = -ENOMEM;
>>
>> Shouldn't this one have a --i line like the next IS_ERR block?
>>
>>> +                     goto unregister_tzs;
>>> +             }
>>> +
>>> +             zone->temp_reg = tegra->regs + thermctl_temp_offsets[i];
>>> +             zone->temp_shift = thermctl_temp_shifts[i];
>>> +
>>> +             tz = thermal_zone_of_sensor_register(
>>> +                     &pdev->dev, i, zone, tegra_thermctl_get_temp, NULL);
>>> +             if (IS_ERR(tz)) {
>>> +                     err = PTR_ERR(tz);
>>> +                     dev_err(&pdev->dev, "failed to register sensor: %d\n",
>>> +                             err);
>>> +                     --i;
>>> +                     goto unregister_tzs;
>>> +             }
>>> +
>>> +             tegra->thermctl_tzs[i] = tz;
>>> +     }
>
>
>

Cheers,
Mikko

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juha-Matti Tilli Aug. 19, 2014, 6:25 p.m. UTC | #5
> > Originally, I thought the fuse offsets were incorrect but as it turns
> > out, the official Linux kernel starts counting the fuses at a different
> > location than NVIDIA's internal kernel.
> 
> This is a major concern. Juha-Matti and Mikko, can you please cross
> check if this driver is accessing to the correct fuse register
> location?

Mikko told me that the official Linux kernel and NVIDIA's internal
kernel have an offset of 0x100 between fuse registers. I believe it is
accessing correct fuse register locations, as I tested the code and it
works. But, if I port the code to a codebase that is using NVIDIA's
internal kernel functions, it fails (all fuses read 0x0) unless I offset
all fuses by 0x100. All fuse register locations in this code indeed have
the same offset of 0x100 between NVIDIA's internal docs & code and the
new driver code.

I'm not entirely sure why the official Linux kernel and NVIDIA's
internal code starts counting the fuses at a different location, but the
fuses read do have data in them and the temperature readings look sane,
so I assume they are indeed the correct fuses. The calculated
calibration values also seem to be similar to (but not identical with)
the ones calculated by NVIDIA's internal driver, but more on this later.

drivers/soc/tegra/fuse/fuse-tegra30.c in upstream has "#define
FUSE_BEGIN 0x100" and the read is "readl_relaxed(fuse_base + FUSE_BEGIN
+ offset);" so there's the explanation for the 0x100 offset.

I believe Mikko's explanation about the 0x100 offset, so the code in my
opinion seems correct as the offset of 0x100 is consistent, and the code
does in fact work. Reading incorrect fuses would mean the fuses read 0x0
and the temperatures are so strange that they are obviously incorrect.

Mikko said he did the same mistake earlier and fixed it. Well, I too did
the same mistake. I guess everyone working with fuses on both NVIDIA's
internal code and the upstream kernel will make the same mistake once
but not again.

> > The temperature readings look sane and do vary with load, but I think
> > they're a bit different than what NVIDIA's internal soc_therm driver
> 
> hmm..  Based on a single sample?

Yes, the comparison between NVIDIA's internal driver and this driver was
based only on a single sample. I don't now recall what the difference
was, but I should probably test it multiple times tomorrow so I can
report how large the difference is. Might be within the error margin, as
the temperatures are not stable to a 0.5 degrees C precision. A problem
here is that my test was basically loading code adopted from this new
driver dynamically on a system already using the NVIDIA's existing
driver as a static driver, so a new sample requires a reboot and there's
always a time delay between the commands used to manually read the
temperature, load the dynamic module and re-read the temperature. I
could tomorrow write a userspace test program to overwrite the registers
with the old and new values and read the temperature sensor register
immediately before and after the value switch to see how much the
readings differ. That way I could get multiple accurate data points with
no reboots required.

The test that the temperatures do vary with load was based on multiple
samples of running four shell infinite loops in the background. I can
easily reach 80 degrees C with that on my development board with passive
cooling. Killing the four loops results in the temperature slowly
trickling to the idle temperature. The idle temp seems to be a bit over
30 degrees C. My Jetson TK1 has active cooling, so the peak temperatures
are a lot lower than for the other development board. This loads only
the CPU, though. Loading additionally the GPU would cause higher
temperatures.

Mikko already noticed that NVIDIA's internal driver uses
div64_s64_precise and Mikko opted for a simpler implementation, so that
might be indeed the cause for the different calibration values. I'll
need to investigate this more. Yet another difference could be that
Mikko posted a link to an older version of the downstream kernel sources
(https://developer.nvidia.com/sites/default/files/akamai/mobile/files/L4T/kernel_src.tbz2)
-- if Mikko has based his code on that old code, it's possible that the
calibration value calculation has been improved afterwards. We'll know
for sure after I have had time to investigate this more.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juha-Matti Tilli Aug. 19, 2014, 6:25 p.m. UTC | #6
On Tue, Aug 19, 2014 at 06:27:04PM +0300, Mikko Perttunen wrote:
> > Well, I would suggest using the hardware documentation as a base here.
> > I don't have access to the internal driver, thus, it is hard to judge
> > what is right, what is wrong.
> 
> The hardware documentation (TRM) doesn't say anything about these 
> registers, so I've mostly gone by the downstream driver. FYI, the driver 
> is publicly available in the L4T kernel sources at 
> https://developer.nvidia.com/sites/default/files/akamai/mobile/files/L4T/kernel_src.tbz2. 
> I'm not how useful reading them would be, though.

I also haven't seen anything about the calibration calculation in any
hardware documentation so it may very well be the case that nothing has
been documented and the official information source is the internal
driver itself. Btw, this downstream driver code is most likely old code
as I can only find kernel/arch/arm/mach-tegra/tegra11_socterm.c in
there, and I do remember that on my development branch the file is named
differently. The CONFIG2 value comparison I posted is most likely
against a newer version of that code. I'm not sure if that new version
of that code is publicly available anywhere yet. I'll need to
investigate more if a newer version of that code calculates different
calibration values than an older version of that code. I'm currently
working on soc_therm for future chips, so I have plenty of time to
investigate this.

At least this internal driver can be used as a benchmark for code
clarity. Mikko's code is a lot more clear than this internal driver, so
that's why I decided to base my work on Mikko's code instead of this
internal driver and which is why I noticed the missing --i bug. But now
I'm wondering whether the calibration values are correct...
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juha-Matti Tilli Aug. 20, 2014, 12:05 p.m. UTC | #7
On Tue, Aug 19, 2014 at 10:33:13AM -0400, edubezval@gmail.com wrote:
> > The temperature readings look sane and do vary with load, but I think
> > they're a bit different than what NVIDIA's internal soc_therm driver
> 
> hmm..  Based on a single sample?

Now I have more than one sample. I wrote a test program to load the
CONFIG2 registers with the values NVIDIA's internal driver sets and also
with the values the patch sets. The difference on my development board
is between 3 and 4 degrees C, with the patch giving hotter readings than
the downstream driver. This is repeatable; I have made about ten test
runs and every time I can observe the difference.

As a matter of fact, I believe there are two other bugs in the patch in
addition to the --i error handling bug.

Firstly, shifted_ft seems to be read from the wrong register. It should
be read from FUSE_TSENSOR8_CALIB (and it's read from there in the
downstream driver), but Mikko's code erroneously reads it from
FUSE_SPARE_REALIGNMENT_REG_0. So, the downstream code has fields like
this:
FUSE_TSENSOR8_CALIB = zero-padding | shifted_ft | base_ft | base_cp
FUSE_SPARE_REALIGNMENT_REG_0 = zero-padding | shifted_cp

While the patch has the fields like that:
FUSE_TSENSOR8_CALIB = zero-padding | base_ft | base_cp
FUSE_SPARE_REALIGNMENT_REG_0 = zero-padding | shifted_ft | zero-padding  shifted_cp

Note the illogical zero-padding in the middle of
FUSE_SPARE_REALIGNMENT_REG_0 according to the patch.

As a result of that, the patch reads shifted_ft as 0 on my system even
though it should read -2, which is the value the downstream driver
reads. This also suggests that the patch is reading the incorrect
location as it gets all zeroes. This is the major reason for the
differing temperature readings between the downstream code and this
driver.

Secondly, .tsample_ate should be 480, not 481.

I communicated a patch to fix these issues to Mikko; he'll surely send a
version 5 of the patch with these issues fixed back to these mailing
lists.

As Mikko already noticed, yet another difference is that
div64_s64_precise is used in the downstream code instead of div_s64. I'm
not sure if that should be included in the code: it would mean higher
precision, but it would make the code more complicated. The difference
between div64_s64_precise and div_s64 is anyway small so either way is
fine with me.

Other than these issues, the code has been both tested by me and
reviewed by me and in my opinion it seems to have good code clarity and
be a good addition to the kernel, once the --i bug, the .tsample_ate bug
and the shifted_ft bug are fixed.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 693208e..fd9d049 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -175,6 +175,16 @@  config ARMADA_THERMAL
 	  Enable this option if you want to have support for thermal management
 	  controller present in Armada 370 and Armada XP SoC.
 
+config TEGRA_SOCTHERM
+	tristate "Tegra SOCTHERM thermal management"
+	depends on ARCH_TEGRA
+	help
+	  Enable this option for integrated thermal management support on NVIDIA
+	  Tegra124 systems-on-chip. The driver supports four thermal zones
+	  (CPU, GPU, MEM, PLLX). Cooling devices can be bound to the thermal
+	  zones to manage temperatures. This option is also required for the
+	  emergency thermal reset (thermtrip) feature to function.
+
 config DB8500_CPUFREQ_COOLING
 	tristate "DB8500 cpufreq cooling"
 	depends on ARCH_U8500
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 31e232f..f0b94d5 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -33,3 +33,4 @@  obj-$(CONFIG_INTEL_SOC_DTS_THERMAL)	+= intel_soc_dts_thermal.o
 obj-$(CONFIG_TI_SOC_THERMAL)	+= ti-soc-thermal/
 obj-$(CONFIG_ACPI_INT3403_THERMAL)	+= int3403_thermal.o
 obj-$(CONFIG_ST_THERMAL)	+= st/
+obj-$(CONFIG_TEGRA_SOCTHERM)	+= tegra_soctherm.o
diff --git a/drivers/thermal/tegra_soctherm.c b/drivers/thermal/tegra_soctherm.c
new file mode 100644
index 0000000..2d2e99b
--- /dev/null
+++ b/drivers/thermal/tegra_soctherm.c
@@ -0,0 +1,458 @@ 
+/*
+ * drivers/thermal/tegra_soctherm.c
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Author:
+ *	Mikko Perttunen <mperttunen@nvidia.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/reset.h>
+#include <linux/thermal.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <soc/tegra/fuse.h>
+
+#define SENSOR_CONFIG0				0
+#define		SENSOR_CONFIG0_STOP		BIT(0)
+#define		SENSOR_CONFIG0_TALL_SHIFT	8
+#define		SENSOR_CONFIG0_TCALC_OVER	BIT(4)
+#define		SENSOR_CONFIG0_OVER		BIT(3)
+#define		SENSOR_CONFIG0_CPTR_OVER	BIT(2)
+#define SENSOR_CONFIG1				4
+#define		SENSOR_CONFIG1_TSAMPLE_SHIFT	0
+#define		SENSOR_CONFIG1_TIDDQ_EN_SHIFT	15
+#define		SENSOR_CONFIG1_TEN_COUNT_SHIFT	24
+#define		SENSOR_CONFIG1_TEMP_ENABLE	BIT(31)
+#define SENSOR_CONFIG2				8
+#define		SENSOR_CONFIG2_THERMA_SHIFT	16
+#define		SENSOR_CONFIG2_THERMB_SHIFT	0
+
+#define SENSOR_PDIV				0x1c0
+#define		SENSOR_PDIV_T124		0x8888
+#define SENSOR_HOTSPOT_OFF			0x1c4
+#define		SENSOR_HOTSPOT_OFF_T124		0x00060600
+#define SENSOR_TEMP1				0x1c8
+#define SENSOR_TEMP2				0x1cc
+
+#define SENSOR_TEMP_MASK			0xffff
+#define READBACK_VALUE_MASK			0xff00
+#define READBACK_VALUE_SHIFT			8
+#define READBACK_ADD_HALF			BIT(7)
+#define READBACK_NEGATE				BIT(1)
+
+#define FUSE_TSENSOR8_CALIB			0x180
+#define FUSE_SPARE_REALIGNMENT_REG_0		0x1fc
+
+#define FUSE_TSENSOR_CALIB_CP_TS_BASE_MASK	0x1fff
+#define FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK	(0x1fff << 13)
+#define FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT	13
+
+#define FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK	0x3ff
+#define FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK	(0x7ff << 10)
+#define FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT	10
+
+#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_CP_MASK 0x3f
+#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK (0x1f << 21)
+#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT 21
+
+#define NOMINAL_CALIB_FT_T124			105
+#define NOMINAL_CALIB_CP_T124			25
+
+struct tegra_tsensor_configuration {
+	u32 tall, tsample, tiddq_en, ten_count;
+	u32 pdiv, tsample_ate, pdiv_ate;
+};
+
+struct tegra_tsensor {
+	u32 base;
+	u32 calib_fuse_offset;
+	/* Correction values used to modify values read from calibration fuses */
+	s32 fuse_corr_alpha, fuse_corr_beta;
+};
+
+struct tegra_thermctl_zone {
+	void __iomem *temp_reg;
+	int temp_shift;
+};
+
+static const struct tegra_tsensor_configuration t124_tsensor_config = {
+	.tall = 16300,
+	.tsample = 120,
+	.tiddq_en = 1,
+	.ten_count = 1,
+	.pdiv = 8,
+	.tsample_ate = 481,
+	.pdiv_ate = 8
+};
+
+static const struct tegra_tsensor t124_tsensors[] = {
+	{
+		.base = 0xc0,
+		.calib_fuse_offset = 0x098,
+		.fuse_corr_alpha = 1135400,
+		.fuse_corr_beta = -6266900,
+	},
+	{
+		.base = 0xe0,
+		.calib_fuse_offset = 0x084,
+		.fuse_corr_alpha = 1122220,
+		.fuse_corr_beta = -5700700,
+	},
+	{
+		.base = 0x100,
+		.calib_fuse_offset = 0x088,
+		.fuse_corr_alpha = 1127000,
+		.fuse_corr_beta = -6768200,
+	},
+	{
+		.base = 0x120,
+		.calib_fuse_offset = 0x12c,
+		.fuse_corr_alpha = 1110900,
+		.fuse_corr_beta = -6232000,
+	},
+	{
+		.base = 0x140,
+		.calib_fuse_offset = 0x158,
+		.fuse_corr_alpha = 1122300,
+		.fuse_corr_beta = -5936400,
+	},
+	{
+		.base = 0x160,
+		.calib_fuse_offset = 0x15c,
+		.fuse_corr_alpha = 1145700,
+		.fuse_corr_beta = -7124600,
+	},
+	{
+		.base = 0x180,
+		.calib_fuse_offset = 0x154,
+		.fuse_corr_alpha = 1120100,
+		.fuse_corr_beta = -6000500,
+	},
+	{
+		.base = 0x1a0,
+		.calib_fuse_offset = 0x160,
+		.fuse_corr_alpha = 1106500,
+		.fuse_corr_beta = -6729300,
+	},
+};
+
+struct tegra_soctherm {
+	struct reset_control *reset;
+	struct clk *clock_tsensor;
+	struct clk *clock_soctherm;
+	void __iomem *regs;
+
+	struct thermal_zone_device *thermctl_tzs[4];
+};
+
+struct tsensor_shared_calibration {
+	u32 base_cp, base_ft;
+	u32 actual_temp_cp, actual_temp_ft;
+};
+
+static int calculate_shared_calibration(struct tsensor_shared_calibration *r)
+{
+	u32 val;
+	u32 shifted_cp, shifted_ft;
+	int err;
+
+	err = tegra_fuse_readl(FUSE_TSENSOR8_CALIB, &val);
+	if (err)
+		return err;
+	r->base_cp = val & FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK;
+	r->base_ft = (val & FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK)
+		>> FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT;
+
+	err = tegra_fuse_readl(FUSE_SPARE_REALIGNMENT_REG_0, &val);
+	if (err)
+		return err;
+	shifted_cp = sign_extend32(val, 5);
+	val = ((val & FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK)
+		>> FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT);
+	shifted_ft = sign_extend32(val, 4);
+
+	r->actual_temp_cp = 2 * NOMINAL_CALIB_CP_T124 + shifted_cp;
+	r->actual_temp_ft = 2 * NOMINAL_CALIB_FT_T124 + shifted_ft;
+
+	return 0;
+}
+
+static int calculate_tsensor_calibration(
+	struct tegra_tsensor *sensor,
+	struct tsensor_shared_calibration shared,
+	u32 *calib
+)
+{
+	u32 val;
+	s32 actual_tsensor_ft, actual_tsensor_cp;
+	s32 delta_sens, delta_temp;
+	s32 mult, div;
+	s16 therma, thermb;
+	int err;
+
+	err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
+	if (err)
+		return err;
+
+	actual_tsensor_cp = (shared.base_cp * 64) + sign_extend32(val, 12);
+	val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK)
+		>> FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
+	actual_tsensor_ft = (shared.base_ft * 32) + sign_extend32(val, 12);
+
+	delta_sens = actual_tsensor_ft - actual_tsensor_cp;
+	delta_temp = shared.actual_temp_ft - shared.actual_temp_cp;
+
+	mult = t124_tsensor_config.pdiv * t124_tsensor_config.tsample_ate;
+	div = t124_tsensor_config.tsample * t124_tsensor_config.pdiv_ate;
+
+	therma = div_s64((s64) delta_temp * (1LL << 13) * mult,
+		(s64) delta_sens * div);
+	thermb = div_s64(((s64) actual_tsensor_ft * shared.actual_temp_cp) -
+		((s64) actual_tsensor_cp * shared.actual_temp_ft),
+		(s64) delta_sens);
+
+	therma = div_s64((s64) therma * sensor->fuse_corr_alpha,
+		(s64) 1000000LL);
+	thermb = div_s64((s64) thermb * sensor->fuse_corr_alpha +
+		sensor->fuse_corr_beta,
+		(s64) 1000000LL);
+
+	*calib = ((u16)(therma) << SENSOR_CONFIG2_THERMA_SHIFT) |
+		((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
+
+	return 0;
+}
+
+static int enable_tsensor(struct tegra_soctherm *tegra,
+			  struct tegra_tsensor *sensor,
+			  struct tsensor_shared_calibration shared)
+{
+	void * __iomem base = tegra->regs + sensor->base;
+	unsigned int val;
+	u32 calib;
+	int err;
+
+	err = calculate_tsensor_calibration(sensor, shared, &calib);
+	if (err)
+		return err;
+
+	val = 0;
+	val |= t124_tsensor_config.tall << SENSOR_CONFIG0_TALL_SHIFT;
+	writel(val, base + SENSOR_CONFIG0);
+
+	val = 0;
+	val |= (t124_tsensor_config.tsample - 1) <<
+		SENSOR_CONFIG1_TSAMPLE_SHIFT;
+	val |= t124_tsensor_config.tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
+	val |= t124_tsensor_config.ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
+	val |= SENSOR_CONFIG1_TEMP_ENABLE;
+	writel(val, base + SENSOR_CONFIG1);
+
+	writel(calib, base + SENSOR_CONFIG2);
+
+	return 0;
+}
+
+/* Translate from soctherm readback format to millicelsius.
+ * The soctherm readback format in bits is as follows:
+ *   TTTTTTTT H______N
+ * where T's contain the temperature in Celsius,
+ * H denotes an addition of 0.5 Celsius and N denotes negation
+ * of the final value.
+ */
+static inline long translate_temp(u16 val)
+{
+	long t;
+
+	t = ((val & READBACK_VALUE_MASK) >> READBACK_VALUE_SHIFT) * 1000;
+	if (val & READBACK_ADD_HALF)
+		t += 500;
+	if (val & READBACK_NEGATE)
+		t *= -1;
+
+	return t;
+}
+
+static int tegra_thermctl_get_temp(void *data, long *out_temp)
+{
+	struct tegra_thermctl_zone *zone = data;
+	u32 val;
+
+	val = (readl(zone->temp_reg) >> zone->temp_shift) & SENSOR_TEMP_MASK;
+	*out_temp = translate_temp(val);
+
+	return 0;
+}
+
+static struct of_device_id tegra_soctherm_of_match[] = {
+	{ .compatible = "nvidia,tegra124-soctherm" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, tegra_soctherm_of_match);
+
+static int thermctl_temp_offsets[] = {
+	SENSOR_TEMP1, SENSOR_TEMP2, SENSOR_TEMP1, SENSOR_TEMP2
+};
+
+static int thermctl_temp_shifts[] = {
+	16, 16, 0, 0
+};
+
+static int tegra_soctherm_probe(struct platform_device *pdev)
+{
+	struct tegra_soctherm *tegra;
+	struct thermal_zone_device *tz;
+	struct tsensor_shared_calibration shared_calib;
+	int i;
+	int err = 0;
+
+	struct tegra_tsensor *tsensors = t124_tsensors;
+
+	tegra = devm_kzalloc(&pdev->dev, sizeof(*tegra), GFP_KERNEL);
+	if (!tegra)
+		return -ENOMEM;
+
+	tegra->regs = devm_ioremap_resource(&pdev->dev,
+		platform_get_resource(pdev, IORESOURCE_MEM, 0));
+	if (IS_ERR(tegra->regs)) {
+		dev_err(&pdev->dev, "can't get registers");
+		return PTR_ERR(tegra->regs);
+	}
+
+	tegra->reset = devm_reset_control_get(&pdev->dev, "soctherm");
+	if (IS_ERR(tegra->reset)) {
+		dev_err(&pdev->dev, "can't get soctherm reset\n");
+		return PTR_ERR(tegra->reset);
+	}
+
+	tegra->clock_tsensor = devm_clk_get(&pdev->dev, "tsensor");
+	if (IS_ERR(tegra->clock_tsensor)) {
+		dev_err(&pdev->dev, "can't get clock tsensor\n");
+		return PTR_ERR(tegra->clock_tsensor);
+	}
+
+	tegra->clock_soctherm = devm_clk_get(&pdev->dev, "soctherm");
+	if (IS_ERR(tegra->clock_soctherm)) {
+		dev_err(&pdev->dev, "can't get clock soctherm\n");
+		return PTR_ERR(tegra->clock_soctherm);
+	}
+
+	reset_control_assert(tegra->reset);
+
+	err = clk_prepare_enable(tegra->clock_soctherm);
+	if (err) {
+		reset_control_deassert(tegra->reset);
+		return err;
+	}
+
+	err = clk_prepare_enable(tegra->clock_tsensor);
+	if (err) {
+		clk_disable_unprepare(tegra->clock_soctherm);
+		reset_control_deassert(tegra->reset);
+		return err;
+	}
+
+	reset_control_deassert(tegra->reset);
+
+	/* Initialize raw sensors */
+
+	err = calculate_shared_calibration(&shared_calib);
+	if (err)
+		goto disable_clocks;
+
+	for (i = 0; i < ARRAY_SIZE(t124_tsensors); ++i) {
+		err = enable_tsensor(tegra, tsensors + i, shared_calib);
+		if (err)
+			goto disable_clocks;
+	}
+
+	writel(SENSOR_PDIV_T124, tegra->regs + SENSOR_PDIV);
+	writel(SENSOR_HOTSPOT_OFF_T124, tegra->regs + SENSOR_HOTSPOT_OFF);
+
+	/* Initialize thermctl sensors */
+
+	for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
+		struct tegra_thermctl_zone *zone =
+			devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
+		if (!zone) {
+			err = -ENOMEM;
+			goto unregister_tzs;
+		}
+
+		zone->temp_reg = tegra->regs + thermctl_temp_offsets[i];
+		zone->temp_shift = thermctl_temp_shifts[i];
+
+		tz = thermal_zone_of_sensor_register(
+			&pdev->dev, i, zone, tegra_thermctl_get_temp, NULL);
+		if (IS_ERR(tz)) {
+			err = PTR_ERR(tz);
+			dev_err(&pdev->dev, "failed to register sensor: %d\n",
+				err);
+			--i;
+			goto unregister_tzs;
+		}
+
+		tegra->thermctl_tzs[i] = tz;
+	}
+
+	return 0;
+
+unregister_tzs:
+	for (; i >= 0; i--)
+		thermal_zone_of_sensor_unregister(&pdev->dev,
+						  tegra->thermctl_tzs[i]);
+
+disable_clocks:
+	clk_disable_unprepare(tegra->clock_tsensor);
+	clk_disable_unprepare(tegra->clock_soctherm);
+
+	return err;
+}
+
+static int tegra_soctherm_remove(struct platform_device *pdev)
+{
+	struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
+		thermal_zone_of_sensor_unregister(&pdev->dev,
+						  tegra->thermctl_tzs[i]);
+	}
+
+	clk_disable_unprepare(tegra->clock_tsensor);
+	clk_disable_unprepare(tegra->clock_soctherm);
+
+	return 0;
+}
+
+static struct platform_driver tegra_soctherm_driver = {
+	.probe = tegra_soctherm_probe,
+	.remove = tegra_soctherm_remove,
+	.driver = {
+		.name = "tegra_soctherm",
+		.of_match_table = tegra_soctherm_of_match,
+	},
+};
+module_platform_driver(tegra_soctherm_driver);
+
+MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
+MODULE_DESCRIPTION("Tegra SOCTHERM thermal management driver");
+MODULE_LICENSE("GPL v2");