Message ID | 20200324204455.2988-1-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915/selftests: Measure the energy consumed while in RC6 | expand |
Quoting Chris Wilson (2020-03-24 20:44:55) > + dt = ktime_get(); > + rc0_power = energy_uJ(rc6); > res[0] = rc6_residency(rc6); > msleep(250); > res[1] = rc6_residency(rc6); > + rc0_power = div64_u64(energy_uJ(rc6) - rc0_power, > + ktime_to_ns(ktime_sub(ktime_get(), dt))); Did you forget this was in ns? You did! -Chris
On Tue, Mar 24, 2020 at 1:45 PM Chris Wilson <chris@chris-wilson.co.uk> wrote: > > Measure and compare the energy consumed, as reported by the rapl MSR, > by the GPU while in RC0 and RC6 states. Throw an error if RC6 does not > at least halve the energy consumption of RC0, as this more than likely > means we failed to enter RC0 correctly. > > If we can't measure the energy draw with the MSR, then it will report 0 > for both measurements. Since the measurement works on all gen6+, this seems > worth flagging as an error. I'm confused by this statement here. MSR is a *CPU* register and you are using it here, mixed with RC6. How is that supposed to work with, e.g., dgfx? thanks Lucas De Marchi > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > Cc: Andi Shyti <andi.shyti@intel.com> > --- > drivers/gpu/drm/i915/gt/selftest_rc6.c | 39 ++++++++++++++++++++++++++ > 1 file changed, 39 insertions(+) > > diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c > index 95b165faeba7..3ac9a8925218 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c > +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c > @@ -12,6 +12,22 @@ > > #include "selftests/i915_random.h" > > +#define MCH_SECP_NRG_STTS _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x592c) > + > +static u64 energy_uJ(struct intel_rc6 *rc6) > +{ > + unsigned long long power; > + u32 units; > + > + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &power)) > + return 0; > + > + units = (power & 0x1f00) >> 8; > + power = intel_uncore_read_fw(rc6_to_uncore(rc6), MCH_SECP_NRG_STTS); > + > + return (1000000 * power) >> units; /* convert to uJ */ > +} > + > static u64 rc6_residency(struct intel_rc6 *rc6) > { > u64 result; > @@ -31,7 +47,9 @@ int live_rc6_manual(void *arg) > { > struct intel_gt *gt = arg; > struct intel_rc6 *rc6 = >->rc6; > + u64 rc0_power, rc6_power; > intel_wakeref_t wakeref; > + ktime_t dt; > u64 res[2]; > int err = 0; > > @@ -53,22 +71,35 @@ int 
live_rc6_manual(void *arg) > __intel_rc6_disable(rc6); > msleep(1); /* wakeup is not immediate, takes about 100us on icl */ > > + dt = ktime_get(); > + rc0_power = energy_uJ(rc6); > res[0] = rc6_residency(rc6); > msleep(250); > res[1] = rc6_residency(rc6); > + rc0_power = div64_u64(energy_uJ(rc6) - rc0_power, > + ktime_to_ns(ktime_sub(ktime_get(), dt))); > if ((res[1] - res[0]) >> 10) { > pr_err("RC6 residency increased by %lldus while disabled for 250ms!\n", > (res[1] - res[0]) >> 10); > err = -EINVAL; > goto out_unlock; > } > + if (!rc0_power) { > + pr_err("No power measured while in RC0\n"); > + err = -EINVAL; > + goto out_unlock; > + } > > /* Manually enter RC6 */ > intel_rc6_park(rc6); > > + dt = ktime_get(); > + rc6_power = energy_uJ(rc6); > res[0] = rc6_residency(rc6); > msleep(100); > res[1] = rc6_residency(rc6); > + rc6_power = div64_u64(energy_uJ(rc6) - rc6_power, > + ktime_to_ns(ktime_sub(ktime_get(), dt))); > > if (res[1] == res[0]) { > pr_err("Did not enter RC6! RC6_STATE=%08x, RC6_CONTROL=%08x, residency=%lld\n", > @@ -78,6 +109,14 @@ int live_rc6_manual(void *arg) > err = -EINVAL; > } > > + pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n", > + rc0_power, rc6_power); > + if ((rc6_power >> 10) > (rc0_power >> 10) / 2) { /* compare mW */ > + pr_err("GPU leaked energy while in RC6!\n"); > + err = -EINVAL; > + goto out_unlock; > + } > + > /* Restore what should have been the original state! */ > intel_rc6_unpark(rc6); > > -- > 2.20.1 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Quoting Lucas De Marchi (2020-09-29 00:56:54) > On Tue, Mar 24, 2020 at 1:45 PM Chris Wilson <chris@chris-wilson.co.uk> wrote: > > > > Measure and compare the energy consumed, as reported by the rapl MSR, > > by the GPU while in RC0 and RC6 states. Throw an error if RC6 does not > > at least halve the energy consumption of RC0, as this more than likely > > means we failed to enter RC0 correctly. > > > > If we can't measure the energy draw with the MSR, then it will report 0 > > for both measurements. Since the measurement works on all gen6+, this seems > > worth flagging as an error. > > I'm confused by this statement here. MSR is a *CPU* register and you are using > it here, mixed with RC6. How is that supposed to work with, e.g., dgfx? You abstract it with the right interface for hwmon. The card reports energy draw, so the test remains the same, verify that a low power state does consume substantially less energy (and if we can get fine enough granularity that the GT powerwells draw 0). -Chris
diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c index 95b165faeba7..3ac9a8925218 100644 --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c @@ -12,6 +12,22 @@ #include "selftests/i915_random.h" +#define MCH_SECP_NRG_STTS _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x592c) + +static u64 energy_uJ(struct intel_rc6 *rc6) +{ + unsigned long long power; + u32 units; + + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &power)) + return 0; + + units = (power & 0x1f00) >> 8; + power = intel_uncore_read_fw(rc6_to_uncore(rc6), MCH_SECP_NRG_STTS); + + return (1000000 * power) >> units; /* convert to uJ */ +} + static u64 rc6_residency(struct intel_rc6 *rc6) { u64 result; @@ -31,7 +47,9 @@ int live_rc6_manual(void *arg) { struct intel_gt *gt = arg; struct intel_rc6 *rc6 = &gt->rc6; + u64 rc0_power, rc6_power; intel_wakeref_t wakeref; + ktime_t dt; u64 res[2]; int err = 0; @@ -53,22 +71,35 @@ int live_rc6_manual(void *arg) __intel_rc6_disable(rc6); msleep(1); /* wakeup is not immediate, takes about 100us on icl */ + dt = ktime_get(); + rc0_power = energy_uJ(rc6); res[0] = rc6_residency(rc6); msleep(250); res[1] = rc6_residency(rc6); + rc0_power = div64_u64(energy_uJ(rc6) - rc0_power, + ktime_to_ns(ktime_sub(ktime_get(), dt))); if ((res[1] - res[0]) >> 10) { pr_err("RC6 residency increased by %lldus while disabled for 250ms!\n", (res[1] - res[0]) >> 10); err = -EINVAL; goto out_unlock; } + if (!rc0_power) { + pr_err("No power measured while in RC0\n"); + err = -EINVAL; + goto out_unlock; + } /* Manually enter RC6 */ intel_rc6_park(rc6); + dt = ktime_get(); + rc6_power = energy_uJ(rc6); res[0] = rc6_residency(rc6); msleep(100); res[1] = rc6_residency(rc6); + rc6_power = div64_u64(energy_uJ(rc6) - rc6_power, + ktime_to_ns(ktime_sub(ktime_get(), dt))); if (res[1] == res[0]) { pr_err("Did not enter RC6! 
RC6_STATE=%08x, RC6_CONTROL=%08x, residency=%lld\n", @@ -78,6 +109,14 @@ int live_rc6_manual(void *arg) err = -EINVAL; } + pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n", + rc0_power, rc6_power); + if ((rc6_power >> 10) > (rc0_power >> 10) / 2) { /* compare mW */ + pr_err("GPU leaked energy while in RC6!\n"); + err = -EINVAL; + goto out_unlock; + } + /* Restore what should have been the original state! */ intel_rc6_unpark(rc6);
Measure and compare the energy consumed, as reported by the rapl MSR, by the GPU while in RC0 and RC6 states. Throw an error if RC6 does not at least halve the energy consumption of RC0, as this more than likely means we failed to enter RC6 correctly. If we can't measure the energy draw with the MSR, then it will report 0 for both measurements. Since the measurement works on all gen6+, this seems worth flagging as an error. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: Andi Shyti <andi.shyti@intel.com> --- drivers/gpu/drm/i915/gt/selftest_rc6.c | 39 ++++++++++++++++++++++++++ 1 file changed, 39 insertions(+)