Message ID | 20241010173439.2006496-1-sk.anirban@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v5] drm/i915/selftests: Implement frequency logging for energy reading validation | expand |
On 10-10-2024 23:04, Sk Anirban wrote: > Introduce RC6 & RC0 frequency logging mechanism to ensure accurate > energy readings aimed at addressing GPU energy leaks and power > measurement failures. > This enhancement will help ensure the accuracy of energy readings. > > v2: > - Improved commit message. > v3: > - Used pr_err log to display frequency (Anshuman) > - Sorted headers alphabetically (Sai Teja) > v4: > - Improved commit message > - Fix pr_err log (Sai Teja) > v5: > -Add error & debug logging for RC0 power and frequency checks (Anshuman) > > Signed-off-by: Sk Anirban <sk.anirban@intel.com> > Reviewed-by: Sai Teja Pottumuttu <sai.teja.pottumuttu@intel.com> > --- > drivers/gpu/drm/i915/gt/selftest_rc6.c | 20 ++++++++++++++++---- > 1 file changed, 16 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c > index 1aa1446c8fb0..0cf86fed39ca 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c > +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c > @@ -8,6 +8,7 @@ > #include "intel_gpu_commands.h" > #include "intel_gt_requests.h" > #include "intel_ring.h" > +#include "intel_rps.h" > #include "selftest_rc6.h" > > #include "selftests/i915_random.h" > @@ -38,6 +39,9 @@ int live_rc6_manual(void *arg) > ktime_t dt; > u64 res[2]; > int err = 0; > + u32 rc0_freq = 0; > + u32 rc6_freq = 0; > + struct intel_rps *rps = >->rps; > > /* > * Our claim is that we can "encourage" the GPU to enter rc6 at will. > @@ -66,6 +70,7 @@ int live_rc6_manual(void *arg) > rc0_power = librapl_energy_uJ() - rc0_power; > dt = ktime_sub(ktime_get(), dt); > res[1] = rc6_residency(rc6); > + rc0_freq = intel_rps_read_actual_frequency(rps); > if ((res[1] - res[0]) >> 10) { > pr_err("RC6 residency increased by %lldus while disabled for 1000ms!\n", > (res[1] - res[0]) >> 10); > @@ -77,9 +82,14 @@ int live_rc6_manual(void *arg) > rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, > ktime_to_ns(dt)); > if (!rc0_power) { > - pr_err("No power measured while in RC0\n"); > - err = -EINVAL; > - goto out_unlock; > + if (rc0_freq) > + pr_debug("No power measured while in RC0! GPU Freq: %u in RC0\n", > + rc0_freq); This line is not indented properly. > + else { > + pr_err("No power and freq measured while in RC0\n"); > + err = -EINVAL; > + goto out_unlock; AFAIU we should have the EINVAL and goto out_unlock irrespective of rc0_freq. Reason being, if rc0_power is not measured then the comparison we are doing with rc6_power below becomes invalid so I think we shouldn't proceed if rc0_power is not measured and just goto out_unlock with EINVAL. Am I missing something? > + } > } > } > > @@ -91,6 +101,7 @@ int live_rc6_manual(void *arg) > dt = ktime_get(); > rc6_power = librapl_energy_uJ(); > msleep(100); > + rc6_freq = intel_rps_read_actual_frequency(rps); > rc6_power = librapl_energy_uJ() - rc6_power; > dt = ktime_sub(ktime_get(), dt); > res[1] = rc6_residency(rc6); > @@ -108,7 +119,8 @@ int live_rc6_manual(void *arg) > pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n", > rc0_power, rc6_power); > if (2 * rc6_power > rc0_power) { > - pr_err("GPU leaked energy while in RC6!\n"); > + pr_err("GPU leaked energy while in RC6! GPU Freq: %u in RC6 and %u in RC0\n", > + rc6_freq, rc0_freq); > err = -EINVAL; > goto out_unlock; > }
On 11-10-2024 11:52, Anirban, Sk wrote: > > > On 10-10-2024 23:45, Pottumuttu, Sai Teja wrote: >> >> On 10-10-2024 23:04, Sk Anirban wrote: >>> Introduce RC6 & RC0 frequency logging mechanism to ensure accurate >>> energy readings aimed at addressing GPU energy leaks and power >>> measurement failures. >>> This enhancement will help ensure the accuracy of energy readings. >>> >>> v2: >>> - Improved commit message. >>> v3: >>> - Used pr_err log to display frequency (Anshuman) >>> - Sorted headers alphabetically (Sai Teja) >>> v4: >>> - Improved commit message >>> - Fix pr_err log (Sai Teja) >>> v5: >>> -Add error & debug logging for RC0 power and frequency checks >>> (Anshuman) >>> >>> Signed-off-by: Sk Anirban <sk.anirban@intel.com> >>> Reviewed-by: Sai Teja Pottumuttu <sai.teja.pottumuttu@intel.com> >>> --- >>> drivers/gpu/drm/i915/gt/selftest_rc6.c | 20 ++++++++++++++++---- >>> 1 file changed, 16 insertions(+), 4 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c >>> b/drivers/gpu/drm/i915/gt/selftest_rc6.c >>> index 1aa1446c8fb0..0cf86fed39ca 100644 >>> --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c >>> +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c >>> @@ -8,6 +8,7 @@ >>> #include "intel_gpu_commands.h" >>> #include "intel_gt_requests.h" >>> #include "intel_ring.h" >>> +#include "intel_rps.h" >>> #include "selftest_rc6.h" >>> #include "selftests/i915_random.h" >>> @@ -38,6 +39,9 @@ int live_rc6_manual(void *arg) >>> ktime_t dt; >>> u64 res[2]; >>> int err = 0; >>> + u32 rc0_freq = 0; >>> + u32 rc6_freq = 0; >>> + struct intel_rps *rps = >->rps; >>> /* >>> * Our claim is that we can "encourage" the GPU to enter rc6 >>> at will. >>> @@ -66,6 +70,7 @@ int live_rc6_manual(void *arg) >>> rc0_power = librapl_energy_uJ() - rc0_power; >>> dt = ktime_sub(ktime_get(), dt); >>> res[1] = rc6_residency(rc6); >>> + rc0_freq = intel_rps_read_actual_frequency(rps); >>> if ((res[1] - res[0]) >> 10) { >>> pr_err("RC6 residency increased by %lldus while disabled >>> for 1000ms!\n", >>> (res[1] - res[0]) >> 10); >>> @@ -77,9 +82,14 @@ int live_rc6_manual(void *arg) >>> rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, >>> ktime_to_ns(dt)); >>> if (!rc0_power) { >>> - pr_err("No power measured while in RC0\n"); >>> - err = -EINVAL; >>> - goto out_unlock; >>> + if (rc0_freq) >>> + pr_debug("No power measured while in RC0! GPU Freq: >>> %u in RC0\n", >>> + rc0_freq); >> This line is not indented properly. will fix this issue. >>> + else { >>> + pr_err("No power and freq measured while in RC0\n"); >>> + err = -EINVAL; >>> + goto out_unlock; >> AFAIU we should have the EINVAL and goto out_unlock irrespective of >> rc0_freq. Reason being, if rc0_power is not measured then the >> comparison we are doing with rc6_power below becomes invalid so I >> think we shouldn't proceed if rc0_power is not measured and just goto >> out_unlock with EINVAL. Am I missing something? the main purpose was to print the message as debug_log if there is any issue with the power measurement, although we may include a check whether rc_0 power is there or not before the final validation of "(2 * rc6_power > rc0_power)". @Anshuman, please provide your inputs here. >>> + } >>> } >>> } >>> @@ -91,6 +101,7 @@ int live_rc6_manual(void *arg) >>> dt = ktime_get(); >>> rc6_power = librapl_energy_uJ(); >>> msleep(100); >>> + rc6_freq = intel_rps_read_actual_frequency(rps); >>> rc6_power = librapl_energy_uJ() - rc6_power; >>> dt = ktime_sub(ktime_get(), dt); >>> res[1] = rc6_residency(rc6); >>> @@ -108,7 +119,8 @@ int live_rc6_manual(void *arg) >>> pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n", >>> rc0_power, rc6_power); >>> if (2 * rc6_power > rc0_power) { >>> - pr_err("GPU leaked energy while in RC6!\n"); >>> + pr_err("GPU leaked energy while in RC6! GPU Freq: %u in >>> RC6 and %u in RC0\n", >>> + rc6_freq, rc0_freq); >>> err = -EINVAL; >>> goto out_unlock; >>> }
diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c index 1aa1446c8fb0..0cf86fed39ca 100644 --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c @@ -8,6 +8,7 @@ #include "intel_gpu_commands.h" #include "intel_gt_requests.h" #include "intel_ring.h" +#include "intel_rps.h" #include "selftest_rc6.h" #include "selftests/i915_random.h" @@ -38,6 +39,9 @@ int live_rc6_manual(void *arg) ktime_t dt; u64 res[2]; int err = 0; + u32 rc0_freq = 0; + u32 rc6_freq = 0; + struct intel_rps *rps = >->rps; /* * Our claim is that we can "encourage" the GPU to enter rc6 at will. @@ -66,6 +70,7 @@ int live_rc6_manual(void *arg) rc0_power = librapl_energy_uJ() - rc0_power; dt = ktime_sub(ktime_get(), dt); res[1] = rc6_residency(rc6); + rc0_freq = intel_rps_read_actual_frequency(rps); if ((res[1] - res[0]) >> 10) { pr_err("RC6 residency increased by %lldus while disabled for 1000ms!\n", (res[1] - res[0]) >> 10); @@ -77,9 +82,14 @@ int live_rc6_manual(void *arg) rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, ktime_to_ns(dt)); if (!rc0_power) { - pr_err("No power measured while in RC0\n"); - err = -EINVAL; - goto out_unlock; + if (rc0_freq) + pr_debug("No power measured while in RC0! GPU Freq: %u in RC0\n", + rc0_freq); + else { + pr_err("No power and freq measured while in RC0\n"); + err = -EINVAL; + goto out_unlock; + } } } @@ -91,6 +101,7 @@ int live_rc6_manual(void *arg) dt = ktime_get(); rc6_power = librapl_energy_uJ(); msleep(100); + rc6_freq = intel_rps_read_actual_frequency(rps); rc6_power = librapl_energy_uJ() - rc6_power; dt = ktime_sub(ktime_get(), dt); res[1] = rc6_residency(rc6); @@ -108,7 +119,8 @@ int live_rc6_manual(void *arg) pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n", rc0_power, rc6_power); if (2 * rc6_power > rc0_power) { - pr_err("GPU leaked energy while in RC6!\n"); + pr_err("GPU leaked energy while in RC6! GPU Freq: %u in RC6 and %u in RC0\n", + rc6_freq, rc0_freq); err = -EINVAL; goto out_unlock; }