diff mbox series

[v3] drm/i915/selftests: Refactor RC6 power measurement and error handling

Message ID 20250306191110.2582025-1-sk.anirban@intel.com (mailing list archive)
State New
Headers show
Series [v3] drm/i915/selftests: Refactor RC6 power measurement and error handling | expand

Commit Message

Anirban, Sk March 6, 2025, 7:11 p.m. UTC
From: Sk Anirban <sk.anirban@intel.com>

Refactor power measurement logic to store and compare energy values.
Introduce a threshold check to ensure the GPU enters RC6 properly.

v2:
  - Improved commit message (Badal)

v3:
 - Reorder threshold check (Badal)

Signed-off-by: Sk Anirban <sk.anirban@intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_rc6.c | 59 +++++++++++++++++---------
 1 file changed, 38 insertions(+), 21 deletions(-)

Comments

Nilawar, Badal March 11, 2025, 11:17 a.m. UTC | #1
On 07-03-2025 00:41, sk.anirban@intel.com wrote:
> From: Sk Anirban <sk.anirban@intel.com>
>
> Refactor power measurement logic to store and compare energy values.
> Introduce a threshold check to ensure the GPU enters RC6 properly.
>
> v2:
>    - Improved commit message (Badal)
>
> v3:
>   - Reorder threshold check (Badal)
>
> Signed-off-by: Sk Anirban <sk.anirban@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/selftest_rc6.c | 59 +++++++++++++++++---------
>   1 file changed, 38 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c
> index 908483ab0bc8..5364e50be638 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c
> @@ -33,15 +33,20 @@ int live_rc6_manual(void *arg)
>   {
>   	struct intel_gt *gt = arg;
>   	struct intel_rc6 *rc6 = &gt->rc6;
> -	u64 rc0_power, rc6_power;
> +	struct intel_rps *rps = &gt->rps;
>   	intel_wakeref_t wakeref;
> +	u64 sleep_time = 1000;
> +	u32 rc0_freq = 0;
> +	u32 rc6_freq = 0;
> +	u64 rc0_power[3];
> +	u64 rc6_power[3];
>   	bool has_power;
> +	u64 threshold;
>   	ktime_t dt;
>   	u64 res[2];
>   	int err = 0;
> -	u32 rc0_freq = 0;
> -	u32 rc6_freq = 0;
> -	struct intel_rps *rps = &gt->rps;
> +	u64 diff;
> +
>   
>   	/*
>   	 * Our claim is that we can "encourage" the GPU to enter rc6 at will.
> @@ -65,9 +70,9 @@ int live_rc6_manual(void *arg)
>   	res[0] = rc6_residency(rc6);
>   
>   	dt = ktime_get();
> -	rc0_power = librapl_energy_uJ();
> -	msleep(1000);
> -	rc0_power = librapl_energy_uJ() - rc0_power;
> +	rc0_power[0] = librapl_energy_uJ();
> +	msleep(sleep_time);
> +	rc0_power[1] = librapl_energy_uJ() - rc0_power[0];
>   	dt = ktime_sub(ktime_get(), dt);
>   	res[1] = rc6_residency(rc6);
>   	rc0_freq = intel_rps_read_actual_frequency_fw(rps);
> @@ -79,11 +84,12 @@ int live_rc6_manual(void *arg)
>   	}
>   
>   	if (has_power) {
> -		rc0_power = div64_u64(NSEC_PER_SEC * rc0_power,
> -				      ktime_to_ns(dt));
> -		if (!rc0_power) {
> +		rc0_power[2] = div64_u64(NSEC_PER_SEC * rc0_power[1],
> +					 ktime_to_ns(dt));
> +
> +		if (!rc0_power[2]) {
>   			if (rc0_freq)
> -				pr_debug("No power measured while in RC0! GPU Freq: %u in RC0\n",
> +				pr_debug("No power measured while in RC0! GPU Freq: %uMHz in RC0\n",
>   					 rc0_freq);
>   			else
>   				pr_err("No power and freq measured while in RC0\n");
> @@ -98,10 +104,10 @@ int live_rc6_manual(void *arg)
>   	res[0] = rc6_residency(rc6);
>   	intel_uncore_forcewake_flush(rc6_to_uncore(rc6), FORCEWAKE_ALL);
>   	dt = ktime_get();
> -	rc6_power = librapl_energy_uJ();
> -	msleep(1000);
> +	rc6_power[0] = librapl_energy_uJ();
> +	msleep(sleep_time);
>   	rc6_freq = intel_rps_read_actual_frequency_fw(rps);
> -	rc6_power = librapl_energy_uJ() - rc6_power;
> +	rc6_power[1] = librapl_energy_uJ() - rc6_power[0];
>   	dt = ktime_sub(ktime_get(), dt);
>   	res[1] = rc6_residency(rc6);
>   	if (res[1] == res[0]) {
> @@ -113,13 +119,24 @@ int live_rc6_manual(void *arg)
>   	}
>   
>   	if (has_power) {
> -		rc6_power = div64_u64(NSEC_PER_SEC * rc6_power,
> -				      ktime_to_ns(dt));
> -		pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n",
> -			rc0_power, rc6_power);
> -		if (2 * rc6_power > rc0_power) {
> -			pr_err("GPU leaked energy while in RC6! GPU Freq: %u in RC6 and %u in RC0\n",
> -			       rc6_freq, rc0_freq);
> +		rc6_power[2] = div64_u64(NSEC_PER_SEC * rc6_power[1],
> +					 ktime_to_ns(dt));
> +		pr_info("GPU consumed %lluuW in RC0 and %lluuW in RC6\n",
> +			rc0_power[2], rc6_power[2]);
> +
> +		if (2 * rc6_power[2] > rc0_power[2]) {
> +			pr_err("GPU leaked energy while in RC6!\n"
> +			       "GPU Freq: %uMHz in RC6 and %uMHz in RC0\n"
> +			       "RC0 energy before & after sleep respectively: %lluuJ %lluuJ\n"
> +			       "RC6 energy before & after sleep respectively: %lluuJ %lluuJ\n",
> +			       rc6_freq, rc0_freq, rc0_power[0], rc0_power[1],
> +			       rc6_power[0], rc6_power[1]);
> +
> +			diff = res[1] - res[0];
> +			threshold = (9 * NSEC_PER_MSEC * sleep_time) / 10;
> +			if (diff < threshold)
> +				pr_err("Did not enter RC6 properly, RC6 start residency=%lluns, RC6 end residency=%lluns\n",
> +				       res[0], res[1]);

Please check whether the reported BAT failures are related to this patch. Similar
errors were seen with other selftest-related patches too.
Otherwise this looks good to me.

Reviewed-by: Badal Nilawar <badal.nilawar@intel.com>

Regards,
Badal

>   			err = -EINVAL;
>   			goto out_unlock;
>   		}
Anirban, Sk March 11, 2025, 4:17 p.m. UTC | #2
On 11-03-2025 16:47, Nilawar, Badal wrote:
>
> On 07-03-2025 00:41, sk.anirban@intel.com wrote:
>> From: Sk Anirban <sk.anirban@intel.com>
>>
>> Refactor power measurement logic to store and compare energy values.
>> Introduce a threshold check to ensure the GPU enters RC6 properly.
>>
>> v2:
>>    - Improved commit message (Badal)
>>
>> v3:
>>   - Reorder threshold check (Badal)
>>
>> Signed-off-by: Sk Anirban <sk.anirban@intel.com>
>> ---
>>   drivers/gpu/drm/i915/gt/selftest_rc6.c | 59 +++++++++++++++++---------
>>   1 file changed, 38 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c 
>> b/drivers/gpu/drm/i915/gt/selftest_rc6.c
>> index 908483ab0bc8..5364e50be638 100644
>> --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c
>> +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c
>> @@ -33,15 +33,20 @@ int live_rc6_manual(void *arg)
>>   {
>>       struct intel_gt *gt = arg;
>>       struct intel_rc6 *rc6 = &gt->rc6;
>> -    u64 rc0_power, rc6_power;
>> +    struct intel_rps *rps = &gt->rps;
>>       intel_wakeref_t wakeref;
>> +    u64 sleep_time = 1000;
>> +    u32 rc0_freq = 0;
>> +    u32 rc6_freq = 0;
>> +    u64 rc0_power[3];
>> +    u64 rc6_power[3];
>>       bool has_power;
>> +    u64 threshold;
>>       ktime_t dt;
>>       u64 res[2];
>>       int err = 0;
>> -    u32 rc0_freq = 0;
>> -    u32 rc6_freq = 0;
>> -    struct intel_rps *rps = &gt->rps;
>> +    u64 diff;
>> +
>>         /*
>>        * Our claim is that we can "encourage" the GPU to enter rc6 at 
>> will.
>> @@ -65,9 +70,9 @@ int live_rc6_manual(void *arg)
>>       res[0] = rc6_residency(rc6);
>>         dt = ktime_get();
>> -    rc0_power = librapl_energy_uJ();
>> -    msleep(1000);
>> -    rc0_power = librapl_energy_uJ() - rc0_power;
>> +    rc0_power[0] = librapl_energy_uJ();
>> +    msleep(sleep_time);
>> +    rc0_power[1] = librapl_energy_uJ() - rc0_power[0];
>>       dt = ktime_sub(ktime_get(), dt);
>>       res[1] = rc6_residency(rc6);
>>       rc0_freq = intel_rps_read_actual_frequency_fw(rps);
>> @@ -79,11 +84,12 @@ int live_rc6_manual(void *arg)
>>       }
>>         if (has_power) {
>> -        rc0_power = div64_u64(NSEC_PER_SEC * rc0_power,
>> -                      ktime_to_ns(dt));
>> -        if (!rc0_power) {
>> +        rc0_power[2] = div64_u64(NSEC_PER_SEC * rc0_power[1],
>> +                     ktime_to_ns(dt));
>> +
>> +        if (!rc0_power[2]) {
>>               if (rc0_freq)
>> -                pr_debug("No power measured while in RC0! GPU Freq: 
>> %u in RC0\n",
>> +                pr_debug("No power measured while in RC0! GPU Freq: 
>> %uMHz in RC0\n",
>>                        rc0_freq);
>>               else
>>                   pr_err("No power and freq measured while in RC0\n");
>> @@ -98,10 +104,10 @@ int live_rc6_manual(void *arg)
>>       res[0] = rc6_residency(rc6);
>>       intel_uncore_forcewake_flush(rc6_to_uncore(rc6), FORCEWAKE_ALL);
>>       dt = ktime_get();
>> -    rc6_power = librapl_energy_uJ();
>> -    msleep(1000);
>> +    rc6_power[0] = librapl_energy_uJ();
>> +    msleep(sleep_time);
>>       rc6_freq = intel_rps_read_actual_frequency_fw(rps);
>> -    rc6_power = librapl_energy_uJ() - rc6_power;
>> +    rc6_power[1] = librapl_energy_uJ() - rc6_power[0];
>>       dt = ktime_sub(ktime_get(), dt);
>>       res[1] = rc6_residency(rc6);
>>       if (res[1] == res[0]) {
>> @@ -113,13 +119,24 @@ int live_rc6_manual(void *arg)
>>       }
>>         if (has_power) {
>> -        rc6_power = div64_u64(NSEC_PER_SEC * rc6_power,
>> -                      ktime_to_ns(dt));
>> -        pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n",
>> -            rc0_power, rc6_power);
>> -        if (2 * rc6_power > rc0_power) {
>> -            pr_err("GPU leaked energy while in RC6! GPU Freq: %u in 
>> RC6 and %u in RC0\n",
>> -                   rc6_freq, rc0_freq);
>> +        rc6_power[2] = div64_u64(NSEC_PER_SEC * rc6_power[1],
>> +                     ktime_to_ns(dt));
>> +        pr_info("GPU consumed %lluuW in RC0 and %lluuW in RC6\n",
>> +            rc0_power[2], rc6_power[2]);
>> +
>> +        if (2 * rc6_power[2] > rc0_power[2]) {
>> +            pr_err("GPU leaked energy while in RC6!\n"
>> +                   "GPU Freq: %uMHz in RC6 and %uMHz in RC0\n"
>> +                   "RC0 energy before & after sleep respectively: 
>> %lluuJ %lluuJ\n"
>> +                   "RC6 energy before & after sleep respectively: 
>> %lluuJ %lluuJ\n",
>> +                   rc6_freq, rc0_freq, rc0_power[0], rc0_power[1],
>> +                   rc6_power[0], rc6_power[1]);
>> +
>> +            diff = res[1] - res[0];
>> +            threshold = (9 * NSEC_PER_MSEC * sleep_time) / 10;
>> +            if (diff < threshold)
>> +                pr_err("Did not enter RC6 properly, RC6 start 
>> residency=%lluns, RC6 end residency=%lluns\n",
>> +                       res[0], res[1]);
>
> Check if BAT failures reported are related. Similar errors were seen 
> with other selftest related patches too.
> Otherwise this looks good to me.
>
> Reviewed-by: Badal Nilawar <badal.nilawar@intel.com>
>
> Regards,
> Badal
The BAT failures were not related to these changes and have been re-reported.

Thanks,
Anirban
>
>>               err = -EINVAL;
>>               goto out_unlock;
>>           }
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c
index 908483ab0bc8..5364e50be638 100644
--- a/drivers/gpu/drm/i915/gt/selftest_rc6.c
+++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c
@@ -33,15 +33,20 @@  int live_rc6_manual(void *arg)
 {
 	struct intel_gt *gt = arg;
 	struct intel_rc6 *rc6 = &gt->rc6;
-	u64 rc0_power, rc6_power;
+	struct intel_rps *rps = &gt->rps;
 	intel_wakeref_t wakeref;
+	u64 sleep_time = 1000;
+	u32 rc0_freq = 0;
+	u32 rc6_freq = 0;
+	u64 rc0_power[3];
+	u64 rc6_power[3];
 	bool has_power;
+	u64 threshold;
 	ktime_t dt;
 	u64 res[2];
 	int err = 0;
-	u32 rc0_freq = 0;
-	u32 rc6_freq = 0;
-	struct intel_rps *rps = &gt->rps;
+	u64 diff;
+
 
 	/*
 	 * Our claim is that we can "encourage" the GPU to enter rc6 at will.
@@ -65,9 +70,9 @@  int live_rc6_manual(void *arg)
 	res[0] = rc6_residency(rc6);
 
 	dt = ktime_get();
-	rc0_power = librapl_energy_uJ();
-	msleep(1000);
-	rc0_power = librapl_energy_uJ() - rc0_power;
+	rc0_power[0] = librapl_energy_uJ();
+	msleep(sleep_time);
+	rc0_power[1] = librapl_energy_uJ() - rc0_power[0];
 	dt = ktime_sub(ktime_get(), dt);
 	res[1] = rc6_residency(rc6);
 	rc0_freq = intel_rps_read_actual_frequency_fw(rps);
@@ -79,11 +84,12 @@  int live_rc6_manual(void *arg)
 	}
 
 	if (has_power) {
-		rc0_power = div64_u64(NSEC_PER_SEC * rc0_power,
-				      ktime_to_ns(dt));
-		if (!rc0_power) {
+		rc0_power[2] = div64_u64(NSEC_PER_SEC * rc0_power[1],
+					 ktime_to_ns(dt));
+
+		if (!rc0_power[2]) {
 			if (rc0_freq)
-				pr_debug("No power measured while in RC0! GPU Freq: %u in RC0\n",
+				pr_debug("No power measured while in RC0! GPU Freq: %uMHz in RC0\n",
 					 rc0_freq);
 			else
 				pr_err("No power and freq measured while in RC0\n");
@@ -98,10 +104,10 @@  int live_rc6_manual(void *arg)
 	res[0] = rc6_residency(rc6);
 	intel_uncore_forcewake_flush(rc6_to_uncore(rc6), FORCEWAKE_ALL);
 	dt = ktime_get();
-	rc6_power = librapl_energy_uJ();
-	msleep(1000);
+	rc6_power[0] = librapl_energy_uJ();
+	msleep(sleep_time);
 	rc6_freq = intel_rps_read_actual_frequency_fw(rps);
-	rc6_power = librapl_energy_uJ() - rc6_power;
+	rc6_power[1] = librapl_energy_uJ() - rc6_power[0];
 	dt = ktime_sub(ktime_get(), dt);
 	res[1] = rc6_residency(rc6);
 	if (res[1] == res[0]) {
@@ -113,13 +119,24 @@  int live_rc6_manual(void *arg)
 	}
 
 	if (has_power) {
-		rc6_power = div64_u64(NSEC_PER_SEC * rc6_power,
-				      ktime_to_ns(dt));
-		pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n",
-			rc0_power, rc6_power);
-		if (2 * rc6_power > rc0_power) {
-			pr_err("GPU leaked energy while in RC6! GPU Freq: %u in RC6 and %u in RC0\n",
-			       rc6_freq, rc0_freq);
+		rc6_power[2] = div64_u64(NSEC_PER_SEC * rc6_power[1],
+					 ktime_to_ns(dt));
+		pr_info("GPU consumed %lluuW in RC0 and %lluuW in RC6\n",
+			rc0_power[2], rc6_power[2]);
+
+		if (2 * rc6_power[2] > rc0_power[2]) {
+			pr_err("GPU leaked energy while in RC6!\n"
+			       "GPU Freq: %uMHz in RC6 and %uMHz in RC0\n"
+			       "RC0 energy before & after sleep respectively: %lluuJ %lluuJ\n"
+			       "RC6 energy before & after sleep respectively: %lluuJ %lluuJ\n",
+			       rc6_freq, rc0_freq, rc0_power[0], rc0_power[1],
+			       rc6_power[0], rc6_power[1]);
+
+			diff = res[1] - res[0];
+			threshold = (9 * NSEC_PER_MSEC * sleep_time) / 10;
+			if (diff < threshold)
+				pr_err("Did not enter RC6 properly, RC6 start residency=%lluns, RC6 end residency=%lluns\n",
+				       res[0], res[1]);
 			err = -EINVAL;
 			goto out_unlock;
 		}