Message ID | 20250306191110.2582025-1-sk.anirban@intel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [v3] drm/i915/selftests: Refactor RC6 power measurement and error handling | expand |
On 07-03-2025 00:41, sk.anirban@intel.com wrote: > From: Sk Anirban <sk.anirban@intel.com> > > Refactor power measurement logic to store and compare energy values. > Introduce a threshold check to ensure the GPU enters RC6 properly. > > v2: > - Improved commit message (Badal) > > v3: > - Reorder threshold check (Badal) > > Signed-off-by: Sk Anirban <sk.anirban@intel.com> > --- > drivers/gpu/drm/i915/gt/selftest_rc6.c | 59 +++++++++++++++++--------- > 1 file changed, 38 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c > index 908483ab0bc8..5364e50be638 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c > +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c > @@ -33,15 +33,20 @@ int live_rc6_manual(void *arg) > { > struct intel_gt *gt = arg; > struct intel_rc6 *rc6 = &gt->rc6; > - u64 rc0_power, rc6_power; > + struct intel_rps *rps = &gt->rps; > intel_wakeref_t wakeref; > + u64 sleep_time = 1000; > + u32 rc0_freq = 0; > + u32 rc6_freq = 0; > + u64 rc0_power[3]; > + u64 rc6_power[3]; > bool has_power; > + u64 threshold; > ktime_t dt; > u64 res[2]; > int err = 0; > - u32 rc0_freq = 0; > - u32 rc6_freq = 0; > - struct intel_rps *rps = &gt->rps; > + u64 diff; > + > > /* > * Our claim is that we can "encourage" the GPU to enter rc6 at will.
> @@ -65,9 +70,9 @@ int live_rc6_manual(void *arg) > res[0] = rc6_residency(rc6); > > dt = ktime_get(); > - rc0_power = librapl_energy_uJ(); > - msleep(1000); > - rc0_power = librapl_energy_uJ() - rc0_power; > + rc0_power[0] = librapl_energy_uJ(); > + msleep(sleep_time); > + rc0_power[1] = librapl_energy_uJ() - rc0_power[0]; > dt = ktime_sub(ktime_get(), dt); > res[1] = rc6_residency(rc6); > rc0_freq = intel_rps_read_actual_frequency_fw(rps); > @@ -79,11 +84,12 @@ int live_rc6_manual(void *arg) > } > > if (has_power) { > - rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, > - ktime_to_ns(dt)); > - if (!rc0_power) { > + rc0_power[2] = div64_u64(NSEC_PER_SEC * rc0_power[1], > + ktime_to_ns(dt)); > + > + if (!rc0_power[2]) { > if (rc0_freq) > - pr_debug("No power measured while in RC0! GPU Freq: %u in RC0\n", > + pr_debug("No power measured while in RC0! GPU Freq: %uMHz in RC0\n", > rc0_freq); > else > pr_err("No power and freq measured while in RC0\n"); > @@ -98,10 +104,10 @@ int live_rc6_manual(void *arg) > res[0] = rc6_residency(rc6); > intel_uncore_forcewake_flush(rc6_to_uncore(rc6), FORCEWAKE_ALL); > dt = ktime_get(); > - rc6_power = librapl_energy_uJ(); > - msleep(1000); > + rc6_power[0] = librapl_energy_uJ(); > + msleep(sleep_time); > rc6_freq = intel_rps_read_actual_frequency_fw(rps); > - rc6_power = librapl_energy_uJ() - rc6_power; > + rc6_power[1] = librapl_energy_uJ() - rc6_power[0]; > dt = ktime_sub(ktime_get(), dt); > res[1] = rc6_residency(rc6); > if (res[1] == res[0]) { > @@ -113,13 +119,24 @@ int live_rc6_manual(void *arg) > } > > if (has_power) { > - rc6_power = div64_u64(NSEC_PER_SEC * rc6_power, > - ktime_to_ns(dt)); > - pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n", > - rc0_power, rc6_power); > - if (2 * rc6_power > rc0_power) { > - pr_err("GPU leaked energy while in RC6! 
GPU Freq: %u in RC6 and %u in RC0\n", > - rc6_freq, rc0_freq); > + rc6_power[2] = div64_u64(NSEC_PER_SEC * rc6_power[1], > + ktime_to_ns(dt)); > + pr_info("GPU consumed %lluuW in RC0 and %lluuW in RC6\n", > + rc0_power[2], rc6_power[2]); > + > + if (2 * rc6_power[2] > rc0_power[2]) { > + pr_err("GPU leaked energy while in RC6!\n" > + "GPU Freq: %uMHz in RC6 and %uMHz in RC0\n" > + "RC0 energy before & after sleep respectively: %lluuJ %lluuJ\n" > + "RC6 energy before & after sleep respectively: %lluuJ %lluuJ\n", > + rc6_freq, rc0_freq, rc0_power[0], rc0_power[1], > + rc6_power[0], rc6_power[1]); > + > + diff = res[1] - res[0]; > + threshold = (9 * NSEC_PER_MSEC * sleep_time) / 10; > + if (diff < threshold) > + pr_err("Did not enter RC6 properly, RC6 start residency=%lluns, RC6 end residency=%lluns\n", > + res[0], res[1]); Check if BAT failures reported are related. Similar errors were seen with other selftest related patches too. Otherwise this looks good to me. Reviewed-by: Badal Nilawar <badal.nilawar@intel.com> Regards, Badal > err = -EINVAL; > goto out_unlock; > }
On 11-03-2025 16:47, Nilawar, Badal wrote: > > On 07-03-2025 00:41, sk.anirban@intel.com wrote: >> From: Sk Anirban <sk.anirban@intel.com> >> >> Refactor power measurement logic to store and compare energy values. >> Introduce a threshold check to ensure the GPU enters RC6 properly. >> >> v2: >> - Improved commit message (Badal) >> >> v3: >> - Reorder threshold check (Badal) >> >> Signed-off-by: Sk Anirban <sk.anirban@intel.com> >> --- >> drivers/gpu/drm/i915/gt/selftest_rc6.c | 59 +++++++++++++++++--------- >> 1 file changed, 38 insertions(+), 21 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c >> b/drivers/gpu/drm/i915/gt/selftest_rc6.c >> index 908483ab0bc8..5364e50be638 100644 >> --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c >> +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c >> @@ -33,15 +33,20 @@ int live_rc6_manual(void *arg) >> { >> struct intel_gt *gt = arg; >> struct intel_rc6 *rc6 = &gt->rc6; >> - u64 rc0_power, rc6_power; >> + struct intel_rps *rps = &gt->rps; >> intel_wakeref_t wakeref; >> + u64 sleep_time = 1000; >> + u32 rc0_freq = 0; >> + u32 rc6_freq = 0; >> + u64 rc0_power[3]; >> + u64 rc6_power[3]; >> bool has_power; >> + u64 threshold; >> ktime_t dt; >> u64 res[2]; >> int err = 0; >> - u32 rc0_freq = 0; >> - u32 rc6_freq = 0; >> - struct intel_rps *rps = &gt->rps; >> + u64 diff; >> + >> /* >> * Our claim is that we can "encourage" the GPU to enter rc6 at >> will.
>> @@ -65,9 +70,9 @@ int live_rc6_manual(void *arg) >> res[0] = rc6_residency(rc6); >> dt = ktime_get(); >> - rc0_power = librapl_energy_uJ(); >> - msleep(1000); >> - rc0_power = librapl_energy_uJ() - rc0_power; >> + rc0_power[0] = librapl_energy_uJ(); >> + msleep(sleep_time); >> + rc0_power[1] = librapl_energy_uJ() - rc0_power[0]; >> dt = ktime_sub(ktime_get(), dt); >> res[1] = rc6_residency(rc6); >> rc0_freq = intel_rps_read_actual_frequency_fw(rps); >> @@ -79,11 +84,12 @@ int live_rc6_manual(void *arg) >> } >> if (has_power) { >> - rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, >> - ktime_to_ns(dt)); >> - if (!rc0_power) { >> + rc0_power[2] = div64_u64(NSEC_PER_SEC * rc0_power[1], >> + ktime_to_ns(dt)); >> + >> + if (!rc0_power[2]) { >> if (rc0_freq) >> - pr_debug("No power measured while in RC0! GPU Freq: >> %u in RC0\n", >> + pr_debug("No power measured while in RC0! GPU Freq: >> %uMHz in RC0\n", >> rc0_freq); >> else >> pr_err("No power and freq measured while in RC0\n"); >> @@ -98,10 +104,10 @@ int live_rc6_manual(void *arg) >> res[0] = rc6_residency(rc6); >> intel_uncore_forcewake_flush(rc6_to_uncore(rc6), FORCEWAKE_ALL); >> dt = ktime_get(); >> - rc6_power = librapl_energy_uJ(); >> - msleep(1000); >> + rc6_power[0] = librapl_energy_uJ(); >> + msleep(sleep_time); >> rc6_freq = intel_rps_read_actual_frequency_fw(rps); >> - rc6_power = librapl_energy_uJ() - rc6_power; >> + rc6_power[1] = librapl_energy_uJ() - rc6_power[0]; >> dt = ktime_sub(ktime_get(), dt); >> res[1] = rc6_residency(rc6); >> if (res[1] == res[0]) { >> @@ -113,13 +119,24 @@ int live_rc6_manual(void *arg) >> } >> if (has_power) { >> - rc6_power = div64_u64(NSEC_PER_SEC * rc6_power, >> - ktime_to_ns(dt)); >> - pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n", >> - rc0_power, rc6_power); >> - if (2 * rc6_power > rc0_power) { >> - pr_err("GPU leaked energy while in RC6! 
GPU Freq: %u in >> RC6 and %u in RC0\n", >> - rc6_freq, rc0_freq); >> + rc6_power[2] = div64_u64(NSEC_PER_SEC * rc6_power[1], >> + ktime_to_ns(dt)); >> + pr_info("GPU consumed %lluuW in RC0 and %lluuW in RC6\n", >> + rc0_power[2], rc6_power[2]); >> + >> + if (2 * rc6_power[2] > rc0_power[2]) { >> + pr_err("GPU leaked energy while in RC6!\n" >> + "GPU Freq: %uMHz in RC6 and %uMHz in RC0\n" >> + "RC0 energy before & after sleep respectively: >> %lluuJ %lluuJ\n" >> + "RC6 energy before & after sleep respectively: >> %lluuJ %lluuJ\n", >> + rc6_freq, rc0_freq, rc0_power[0], rc0_power[1], >> + rc6_power[0], rc6_power[1]); >> + >> + diff = res[1] - res[0]; >> + threshold = (9 * NSEC_PER_MSEC * sleep_time) / 10; >> + if (diff < threshold) >> + pr_err("Did not enter RC6 properly, RC6 start >> residency=%lluns, RC6 end residency=%lluns\n", >> + res[0], res[1]); > > Check if BAT failures reported are related. Similar errors were seen > with other selftest related patches too. > Otherwise this looks good to me. > > Reviewed-by: Badal Nilawar <badal.nilawar@intel.com> > > Regards, > Badal BAT failures were not related to these changes and re-reported. Thanks, Anirban > >> err = -EINVAL; >> goto out_unlock; >> }
diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c index 908483ab0bc8..5364e50be638 100644 --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c @@ -33,15 +33,20 @@ int live_rc6_manual(void *arg) { struct intel_gt *gt = arg; struct intel_rc6 *rc6 = &gt->rc6; - u64 rc0_power, rc6_power; + struct intel_rps *rps = &gt->rps; intel_wakeref_t wakeref; + u64 sleep_time = 1000; + u32 rc0_freq = 0; + u32 rc6_freq = 0; + u64 rc0_power[3]; + u64 rc6_power[3]; bool has_power; + u64 threshold; ktime_t dt; u64 res[2]; int err = 0; - u32 rc0_freq = 0; - u32 rc6_freq = 0; - struct intel_rps *rps = &gt->rps; + u64 diff; + /* * Our claim is that we can "encourage" the GPU to enter rc6 at will. @@ -65,9 +70,9 @@ int live_rc6_manual(void *arg) res[0] = rc6_residency(rc6); dt = ktime_get(); - rc0_power = librapl_energy_uJ(); - msleep(1000); - rc0_power = librapl_energy_uJ() - rc0_power; + rc0_power[0] = librapl_energy_uJ(); + msleep(sleep_time); + rc0_power[1] = librapl_energy_uJ() - rc0_power[0]; dt = ktime_sub(ktime_get(), dt); res[1] = rc6_residency(rc6); rc0_freq = intel_rps_read_actual_frequency_fw(rps); @@ -79,11 +84,12 @@ int live_rc6_manual(void *arg) } if (has_power) { - rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, - ktime_to_ns(dt)); - if (!rc0_power) { + rc0_power[2] = div64_u64(NSEC_PER_SEC * rc0_power[1], + ktime_to_ns(dt)); + + if (!rc0_power[2]) { if (rc0_freq) - pr_debug("No power measured while in RC0! GPU Freq: %u in RC0\n", + pr_debug("No power measured while in RC0! 
GPU Freq: %uMHz in RC0\n", rc0_freq); else pr_err("No power and freq measured while in RC0\n"); @@ -98,10 +104,10 @@ int live_rc6_manual(void *arg) res[0] = rc6_residency(rc6); intel_uncore_forcewake_flush(rc6_to_uncore(rc6), FORCEWAKE_ALL); dt = ktime_get(); - rc6_power = librapl_energy_uJ(); - msleep(1000); + rc6_power[0] = librapl_energy_uJ(); + msleep(sleep_time); rc6_freq = intel_rps_read_actual_frequency_fw(rps); - rc6_power = librapl_energy_uJ() - rc6_power; + rc6_power[1] = librapl_energy_uJ() - rc6_power[0]; dt = ktime_sub(ktime_get(), dt); res[1] = rc6_residency(rc6); if (res[1] == res[0]) { @@ -113,13 +119,24 @@ int live_rc6_manual(void *arg) } if (has_power) { - rc6_power = div64_u64(NSEC_PER_SEC * rc6_power, - ktime_to_ns(dt)); - pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n", - rc0_power, rc6_power); - if (2 * rc6_power > rc0_power) { - pr_err("GPU leaked energy while in RC6! GPU Freq: %u in RC6 and %u in RC0\n", - rc6_freq, rc0_freq); + rc6_power[2] = div64_u64(NSEC_PER_SEC * rc6_power[1], + ktime_to_ns(dt)); + pr_info("GPU consumed %lluuW in RC0 and %lluuW in RC6\n", + rc0_power[2], rc6_power[2]); + + if (2 * rc6_power[2] > rc0_power[2]) { + pr_err("GPU leaked energy while in RC6!\n" + "GPU Freq: %uMHz in RC6 and %uMHz in RC0\n" + "RC0 energy before & after sleep respectively: %lluuJ %lluuJ\n" + "RC6 energy before & after sleep respectively: %lluuJ %lluuJ\n", + rc6_freq, rc0_freq, rc0_power[0], rc0_power[1], + rc6_power[0], rc6_power[1]); + + diff = res[1] - res[0]; + threshold = (9 * NSEC_PER_MSEC * sleep_time) / 10; + if (diff < threshold) + pr_err("Did not enter RC6 properly, RC6 start residency=%lluns, RC6 end residency=%lluns\n", + res[0], res[1]); err = -EINVAL; goto out_unlock; }