diff mbox series

drm/i915/selftests: Measure the energy consumed while in RC6

Message ID 20200325081056.23003-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series drm/i915/selftests: Measure the energy consumed while in RC6 | expand

Commit Message

Chris Wilson March 25, 2020, 8:10 a.m. UTC
Measure and compare the energy consumed, as reported by the rapl MSR,
by the GPU while in RC0 and RC6 states. Throw an error if RC6 does not
at least halve the energy consumption of RC0, as this more than likely
means we failed to enter RC6 correctly.

If we can't measure the energy draw with the MSR, then it will report 0
for both measurements. Since the measurement works on all gen6+, this seems
worth flagging as an error.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Andi Shyti <andi.shyti@intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_rc6.c | 43 +++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

Comments

Andi Shyti March 25, 2020, 8:58 a.m. UTC | #1
Hi Chris,

On Wed, Mar 25, 2020 at 08:10:56AM +0000, Chris Wilson wrote:
> Measure and compare the energy consumed, as reported by the rapl MSR,
> by the GPU while in RC0 and RC6 states. Throw an error if RC6 does not
> at least halve the energy consumption of RC0, as this more than likely
> means we failed to enter RC0 correctly.
> 
> If we can't measure the energy draw with the MSR, then it will report 0
> for both measurements. Since the measurement works on all gen6+, this seems
> worth flagging as an error.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Andi Shyti <andi.shyti@intel.com>

It would be nice to have a revision history, given that I received
quite a few versions of this patch.

> +static u64 energy_uJ(struct intel_rc6 *rc6)
> +{
> +	unsigned long long power;
> +	u32 units;
> +
> +	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &power))
> +		return 0;
> +
> +	units = (power & 0x1f00) >> 8;
> +
> +	if (rdmsrl_safe(MSR_PP1_ENERGY_STATUS, &power))
> +		return 0;
> +
> +	return (1000000 * power) >> units; /* convert to uJ */
> +}

shall we put this in a library?

>  	res[0] = rc6_residency(rc6);
> +	dt = ktime_get();
> +	rc0_power = energy_uJ(rc6);
>  	msleep(250);
> +	rc0_power = energy_uJ(rc6) - rc0_power;
> +	dt = ktime_sub(ktime_get(), dt);
>  	res[1] = rc6_residency(rc6);
>  	if ((res[1] - res[0]) >> 10) {
>  		pr_err("RC6 residency increased by %lldus while disabled for 250ms!\n",
> @@ -63,13 +85,23 @@ int live_rc6_manual(void *arg)
>  		goto out_unlock;
>  	}
>  
> +	rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, ktime_to_ns(dt));
> +	if (!rc0_power) {

is this likely to happen?

>  	res[0] = rc6_residency(rc6);
> +	dt = ktime_get();
> +	rc6_power = energy_uJ(rc6);
>  	msleep(100);
> +	rc6_power = energy_uJ(rc6) - rc6_power;
> +	dt = ktime_sub(ktime_get(), dt);
>  	res[1] = rc6_residency(rc6);
> -
>  	if (res[1] == res[0]) {
>  		pr_err("Did not enter RC6! RC6_STATE=%08x, RC6_CONTROL=%08x, residency=%lld\n",
>  		       intel_uncore_read_fw(gt->uncore, GEN6_RC_STATE),
> @@ -78,6 +110,15 @@ int live_rc6_manual(void *arg)
>  		err = -EINVAL;
>  	}
>  
> +	rc6_power = div64_u64(NSEC_PER_SEC * rc6_power, ktime_to_ns(dt));
> +	pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n",
> +		rc0_power, rc6_power);
> +	if (2 * rc6_power > rc0_power) {
> +		pr_err("GPU leaked energy while in RC6!\n");
> +		err = -EINVAL;
> +		goto out_unlock;
> +	}

nice,

Reviewed-by: Andi Shyti <andi.shyti@intel.com>

Thanks,
Andi
Chris Wilson March 25, 2020, 9:10 a.m. UTC | #2
Quoting Andi Shyti (2020-03-25 08:58:54)
> Hi Chris,
> 
> On Wed, Mar 25, 2020 at 08:10:56AM +0000, Chris Wilson wrote:
> > Measure and compare the energy consumed, as reported by the rapl MSR,
> > by the GPU while in RC0 and RC6 states. Throw an error if RC6 does not
> > at least halve the energy consumption of RC0, as this more than likely
> > means we failed to enter RC0 correctly.
> > 
> > If we can't measure the energy draw with the MSR, then it will report 0
> > for both measurements. Since the measurement works on all gen6+, this seems
> > worth flagging as an error.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Andi Shyti <andi.shyti@intel.com>
> 
> would be nice to have a revision history, given that I got quite 
> some versions of this patch.

Nothing that interesting happened, I told myself.

> > +static u64 energy_uJ(struct intel_rc6 *rc6)
> > +{
> > +     unsigned long long power;
> > +     u32 units;
> > +
> > +     if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &power))
> > +             return 0;
> > +
> > +     units = (power & 0x1f00) >> 8;
> > +
> > +     if (rdmsrl_safe(MSR_PP1_ENERGY_STATUS, &power))
> > +             return 0;
> > +
> > +     return (1000000 * power) >> units; /* convert to uJ */
> > +}
> 
> shall we put this in a library?

Call it rapl and make it available via perf? Done.

More seriously, outside of measuring idle power usage, I haven't had an
idea of where else it makes sense. As an optimisation metric, you want work done
per joule, but we have no concept of the user's work in the kernel.
Other things like "operating point power" (the cost of running at a
particular frequency) are mostly constant and not tunable.

> >       res[0] = rc6_residency(rc6);
> > +     dt = ktime_get();
> > +     rc0_power = energy_uJ(rc6);
> >       msleep(250);
> > +     rc0_power = energy_uJ(rc6) - rc0_power;
> > +     dt = ktime_sub(ktime_get(), dt);
> >       res[1] = rc6_residency(rc6);
> >       if ((res[1] - res[0]) >> 10) {
> >               pr_err("RC6 residency increased by %lldus while disabled for 250ms!\n",
> > @@ -63,13 +85,23 @@ int live_rc6_manual(void *arg)
> >               goto out_unlock;
> >       }
> >  
> > +     rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, ktime_to_ns(dt));
> > +     if (!rc0_power) {
> 
> is this likely to happen?

Likely? Only if rapl is unable to measure the GPU energy consumption. So
no, it's not likely, unless you load the guc firmware on icl!
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c
index 95b165faeba7..48f8901d83e8 100644
--- a/drivers/gpu/drm/i915/gt/selftest_rc6.c
+++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c
@@ -12,6 +12,22 @@ 
 
 #include "selftests/i915_random.h"
 
+static u64 energy_uJ(struct intel_rc6 *rc6)
+{
+	unsigned long long power;
+	u32 units;
+
+	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &power))
+		return 0;
+
+	units = (power & 0x1f00) >> 8;
+
+	if (rdmsrl_safe(MSR_PP1_ENERGY_STATUS, &power))
+		return 0;
+
+	return (1000000 * power) >> units; /* to uJ: 1 LSB is 2^-units J */
+}
+
 static u64 rc6_residency(struct intel_rc6 *rc6)
 {
 	u64 result;
@@ -31,7 +47,9 @@  int live_rc6_manual(void *arg)
 {
 	struct intel_gt *gt = arg;
 	struct intel_rc6 *rc6 = &gt->rc6;
+	u64 rc0_power, rc6_power;
 	intel_wakeref_t wakeref;
+	ktime_t dt;
 	u64 res[2];
 	int err = 0;
 
@@ -54,7 +72,11 @@  int live_rc6_manual(void *arg)
 	msleep(1); /* wakeup is not immediate, takes about 100us on icl */
 
 	res[0] = rc6_residency(rc6);
+	dt = ktime_get();
+	rc0_power = energy_uJ(rc6);
 	msleep(250);
+	rc0_power = energy_uJ(rc6) - rc0_power;
+	dt = ktime_sub(ktime_get(), dt);
 	res[1] = rc6_residency(rc6);
 	if ((res[1] - res[0]) >> 10) {
 		pr_err("RC6 residency increased by %lldus while disabled for 250ms!\n",
@@ -63,13 +85,23 @@  int live_rc6_manual(void *arg)
 		goto out_unlock;
 	}
 
+	rc0_power = div64_u64(NSEC_PER_SEC * rc0_power, ktime_to_ns(dt));
+	if (!rc0_power) {
+		pr_err("No power measured while in RC0\n");
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
 	/* Manually enter RC6 */
 	intel_rc6_park(rc6);
 
 	res[0] = rc6_residency(rc6);
+	dt = ktime_get();
+	rc6_power = energy_uJ(rc6);
 	msleep(100);
+	rc6_power = energy_uJ(rc6) - rc6_power;
+	dt = ktime_sub(ktime_get(), dt);
 	res[1] = rc6_residency(rc6);
-
 	if (res[1] == res[0]) {
 		pr_err("Did not enter RC6! RC6_STATE=%08x, RC6_CONTROL=%08x, residency=%lld\n",
 		       intel_uncore_read_fw(gt->uncore, GEN6_RC_STATE),
@@ -78,6 +110,15 @@  int live_rc6_manual(void *arg)
 		err = -EINVAL;
 	}
 
+	rc6_power = div64_u64(NSEC_PER_SEC * rc6_power, ktime_to_ns(dt));
+	pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n",
+		rc0_power, rc6_power);
+	if (2 * rc6_power > rc0_power) {
+		pr_err("GPU leaked energy while in RC6!\n");
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
 	/* Restore what should have been the original state! */
 	intel_rc6_unpark(rc6);