diff mbox series

[v5] drm/i915/selftests: Implement frequency logging for energy reading validation

Message ID 20241010173439.2006496-1-sk.anirban@intel.com (mailing list archive)
State New
Headers show
Series [v5] drm/i915/selftests: Implement frequency logging for energy reading validation | expand

Commit Message

Anirban, Sk Oct. 10, 2024, 5:34 p.m. UTC
Introduce RC6 & RC0 frequency logging mechanism to ensure accurate
energy readings aimed at addressing GPU energy leaks and power
measurement failures.
This enhancement will help ensure the accuracy of energy readings.

v2:
  - Improved commit message.
v3:
  - Used pr_err log to display frequency (Anshuman)
  - Sorted headers alphabetically (Sai Teja)
v4:
  - Improved commit message
  - Fix pr_err log (Sai Teja)
v5:
  -Add error & debug logging for RC0 power and frequency checks (Anshuman)

Signed-off-by: Sk Anirban <sk.anirban@intel.com>
Reviewed-by: Sai Teja Pottumuttu <sai.teja.pottumuttu@intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_rc6.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

Comments

Pottumuttu, Sai Teja Oct. 10, 2024, 6:15 p.m. UTC | #1
On 10-10-2024 23:04, Sk Anirban wrote:
> Introduce RC6 & RC0 frequency logging mechanism to ensure accurate
> energy readings aimed at addressing GPU energy leaks and power
> measurement failures.
> This enhancement will help ensure the accuracy of energy readings.
>
> v2:
>    - Improved commit message.
> v3:
>    - Used pr_err log to display frequency (Anshuman)
>    - Sorted headers alphabetically (Sai Teja)
> v4:
>    - Improved commit message
>    - Fix pr_err log (Sai Teja)
> v5:
>    -Add error & debug logging for RC0 power and frequency checks (Anshuman)
>
> Signed-off-by: Sk Anirban <sk.anirban@intel.com>
> Reviewed-by: Sai Teja Pottumuttu <sai.teja.pottumuttu@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/selftest_rc6.c | 20 ++++++++++++++++----
>   1 file changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c
> index 1aa1446c8fb0..0cf86fed39ca 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c
> @@ -8,6 +8,7 @@
>   #include "intel_gpu_commands.h"
>   #include "intel_gt_requests.h"
>   #include "intel_ring.h"
> +#include "intel_rps.h"
>   #include "selftest_rc6.h"
>   
>   #include "selftests/i915_random.h"
> @@ -38,6 +39,9 @@ int live_rc6_manual(void *arg)
>   	ktime_t dt;
>   	u64 res[2];
>   	int err = 0;
> +	u32 rc0_freq = 0;
> +	u32 rc6_freq = 0;
> +	struct intel_rps *rps = &gt->rps;
>   
>   	/*
>   	 * Our claim is that we can "encourage" the GPU to enter rc6 at will.
> @@ -66,6 +70,7 @@ int live_rc6_manual(void *arg)
>   	rc0_power = librapl_energy_uJ() - rc0_power;
>   	dt = ktime_sub(ktime_get(), dt);
>   	res[1] = rc6_residency(rc6);
> +	rc0_freq = intel_rps_read_actual_frequency(rps);
>   	if ((res[1] - res[0]) >> 10) {
>   		pr_err("RC6 residency increased by %lldus while disabled for 1000ms!\n",
>   		       (res[1] - res[0]) >> 10);
> @@ -77,9 +82,14 @@ int live_rc6_manual(void *arg)
>   		rc0_power = div64_u64(NSEC_PER_SEC * rc0_power,
>   				      ktime_to_ns(dt));
>   		if (!rc0_power) {
> -			pr_err("No power measured while in RC0\n");
> -			err = -EINVAL;
> -			goto out_unlock;
> +			if (rc0_freq)
> +				pr_debug("No power measured while in RC0! GPU Freq: %u in RC0\n",
> +			rc0_freq);
This line is not indented properly.
> +			else {
> +				pr_err("No power and freq measured while in RC0\n");
> +				err = -EINVAL;
> +				goto out_unlock;
AFAIU we should have the EINVAL and goto out_unlock irrespective of 
rc0_freq. Reason being, if rc0_power is not measured then the comparison 
we are doing with rc6_power below becomes invalid so I think we 
shouldn't proceed if rc0_power is not measured and just goto out_unlock 
with EINVAL. Am I missing something?
> +			}
>   		}
>   	}
>   
> @@ -91,6 +101,7 @@ int live_rc6_manual(void *arg)
>   	dt = ktime_get();
>   	rc6_power = librapl_energy_uJ();
>   	msleep(100);
> +	rc6_freq = intel_rps_read_actual_frequency(rps);
>   	rc6_power = librapl_energy_uJ() - rc6_power;
>   	dt = ktime_sub(ktime_get(), dt);
>   	res[1] = rc6_residency(rc6);
> @@ -108,7 +119,8 @@ int live_rc6_manual(void *arg)
>   		pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n",
>   			rc0_power, rc6_power);
>   		if (2 * rc6_power > rc0_power) {
> -			pr_err("GPU leaked energy while in RC6!\n");
> +			pr_err("GPU leaked energy while in RC6! GPU Freq: %u in RC6 and %u in RC0\n",
> +			       rc6_freq, rc0_freq);
>   			err = -EINVAL;
>   			goto out_unlock;
>   		}
Anirban, Sk Oct. 11, 2024, 12:36 p.m. UTC | #2
On 11-10-2024 11:52, Anirban, Sk wrote:
>
>
> On 10-10-2024 23:45, Pottumuttu, Sai Teja wrote:
>>
>> On 10-10-2024 23:04, Sk Anirban wrote:
>>> Introduce RC6 & RC0 frequency logging mechanism to ensure accurate
>>> energy readings aimed at addressing GPU energy leaks and power
>>> measurement failures.
>>> This enhancement will help ensure the accuracy of energy readings.
>>>
>>> v2:
>>>    - Improved commit message.
>>> v3:
>>>    - Used pr_err log to display frequency (Anshuman)
>>>    - Sorted headers alphabetically (Sai Teja)
>>> v4:
>>>    - Improved commit message
>>>    - Fix pr_err log (Sai Teja)
>>> v5:
>>>    -Add error & debug logging for RC0 power and frequency checks 
>>> (Anshuman)
>>>
>>> Signed-off-by: Sk Anirban <sk.anirban@intel.com>
>>> Reviewed-by: Sai Teja Pottumuttu <sai.teja.pottumuttu@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/gt/selftest_rc6.c | 20 ++++++++++++++++----
>>>   1 file changed, 16 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c 
>>> b/drivers/gpu/drm/i915/gt/selftest_rc6.c
>>> index 1aa1446c8fb0..0cf86fed39ca 100644
>>> --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c
>>> +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c
>>> @@ -8,6 +8,7 @@
>>>   #include "intel_gpu_commands.h"
>>>   #include "intel_gt_requests.h"
>>>   #include "intel_ring.h"
>>> +#include "intel_rps.h"
>>>   #include "selftest_rc6.h"
>>>     #include "selftests/i915_random.h"
>>> @@ -38,6 +39,9 @@ int live_rc6_manual(void *arg)
>>>       ktime_t dt;
>>>       u64 res[2];
>>>       int err = 0;
>>> +    u32 rc0_freq = 0;
>>> +    u32 rc6_freq = 0;
>>> +    struct intel_rps *rps = &gt->rps;
>>>         /*
>>>        * Our claim is that we can "encourage" the GPU to enter rc6 
>>> at will.
>>> @@ -66,6 +70,7 @@ int live_rc6_manual(void *arg)
>>>       rc0_power = librapl_energy_uJ() - rc0_power;
>>>       dt = ktime_sub(ktime_get(), dt);
>>>       res[1] = rc6_residency(rc6);
>>> +    rc0_freq = intel_rps_read_actual_frequency(rps);
>>>       if ((res[1] - res[0]) >> 10) {
>>>           pr_err("RC6 residency increased by %lldus while disabled 
>>> for 1000ms!\n",
>>>                  (res[1] - res[0]) >> 10);
>>> @@ -77,9 +82,14 @@ int live_rc6_manual(void *arg)
>>>           rc0_power = div64_u64(NSEC_PER_SEC * rc0_power,
>>>                         ktime_to_ns(dt));
>>>           if (!rc0_power) {
>>> -            pr_err("No power measured while in RC0\n");
>>> -            err = -EINVAL;
>>> -            goto out_unlock;
>>> +            if (rc0_freq)
>>> +                pr_debug("No power measured while in RC0! GPU Freq: 
>>> %u in RC0\n",
>>> +            rc0_freq);
>> This line is not indented properly.
will fix this issue.
>>> +            else {
>>> +                pr_err("No power and freq measured while in RC0\n");
>>> +                err = -EINVAL;
>>> +                goto out_unlock;
>> AFAIU we should have the EINVAL and goto out_unlock irrespective of 
>> rc0_freq. Reason being, if rc0_power is not measured then the 
>> comparison we are doing with rc6_power below becomes invalid so I 
>> think we shouldn't proceed if rc0_power is not measured and just goto 
>> out_unlock with EINVAL. Am I missing something?
the main purpose was to print the message as debug_log if there is any 
issue with the power measurement, although we may include a check 
whether rc_0 power is there or not before the final validation of "(2 * 
rc6_power > rc0_power)". @Anshuman, please provide your inputs here.
>>> +            }
>>>           }
>>>       }
>>>   @@ -91,6 +101,7 @@ int live_rc6_manual(void *arg)
>>>       dt = ktime_get();
>>>       rc6_power = librapl_energy_uJ();
>>>       msleep(100);
>>> +    rc6_freq = intel_rps_read_actual_frequency(rps);
>>>       rc6_power = librapl_energy_uJ() - rc6_power;
>>>       dt = ktime_sub(ktime_get(), dt);
>>>       res[1] = rc6_residency(rc6);
>>> @@ -108,7 +119,8 @@ int live_rc6_manual(void *arg)
>>>           pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n",
>>>               rc0_power, rc6_power);
>>>           if (2 * rc6_power > rc0_power) {
>>> -            pr_err("GPU leaked energy while in RC6!\n");
>>> +            pr_err("GPU leaked energy while in RC6! GPU Freq: %u in 
>>> RC6 and %u in RC0\n",
>>> +                   rc6_freq, rc0_freq);
>>>               err = -EINVAL;
>>>               goto out_unlock;
>>>           }
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c
index 1aa1446c8fb0..0cf86fed39ca 100644
--- a/drivers/gpu/drm/i915/gt/selftest_rc6.c
+++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c
@@ -8,6 +8,7 @@ 
 #include "intel_gpu_commands.h"
 #include "intel_gt_requests.h"
 #include "intel_ring.h"
+#include "intel_rps.h"
 #include "selftest_rc6.h"
 
 #include "selftests/i915_random.h"
@@ -38,6 +39,9 @@  int live_rc6_manual(void *arg)
 	ktime_t dt;
 	u64 res[2];
 	int err = 0;
+	u32 rc0_freq = 0;
+	u32 rc6_freq = 0;
+	struct intel_rps *rps = &gt->rps;
 
 	/*
 	 * Our claim is that we can "encourage" the GPU to enter rc6 at will.
@@ -66,6 +70,7 @@  int live_rc6_manual(void *arg)
 	rc0_power = librapl_energy_uJ() - rc0_power;
 	dt = ktime_sub(ktime_get(), dt);
 	res[1] = rc6_residency(rc6);
+	rc0_freq = intel_rps_read_actual_frequency(rps);
 	if ((res[1] - res[0]) >> 10) {
 		pr_err("RC6 residency increased by %lldus while disabled for 1000ms!\n",
 		       (res[1] - res[0]) >> 10);
@@ -77,9 +82,14 @@  int live_rc6_manual(void *arg)
 		rc0_power = div64_u64(NSEC_PER_SEC * rc0_power,
 				      ktime_to_ns(dt));
 		if (!rc0_power) {
-			pr_err("No power measured while in RC0\n");
-			err = -EINVAL;
-			goto out_unlock;
+			if (rc0_freq)
+				pr_debug("No power measured while in RC0! GPU Freq: %u in RC0\n",
+			rc0_freq);
+			else {
+				pr_err("No power and freq measured while in RC0\n");
+				err = -EINVAL;
+				goto out_unlock;
+			}
 		}
 	}
 
@@ -91,6 +101,7 @@  int live_rc6_manual(void *arg)
 	dt = ktime_get();
 	rc6_power = librapl_energy_uJ();
 	msleep(100);
+	rc6_freq = intel_rps_read_actual_frequency(rps);
 	rc6_power = librapl_energy_uJ() - rc6_power;
 	dt = ktime_sub(ktime_get(), dt);
 	res[1] = rc6_residency(rc6);
@@ -108,7 +119,8 @@  int live_rc6_manual(void *arg)
 		pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n",
 			rc0_power, rc6_power);
 		if (2 * rc6_power > rc0_power) {
-			pr_err("GPU leaked energy while in RC6!\n");
+			pr_err("GPU leaked energy while in RC6! GPU Freq: %u in RC6 and %u in RC0\n",
+			       rc6_freq, rc0_freq);
 			err = -EINVAL;
 			goto out_unlock;
 		}