diff mbox series

[3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies

Message ID 20211117224955.28999-4-vinay.belgaumkar@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/gt: RPS tuning for light media playback | expand

Commit Message

Vinay Belgaumkar Nov. 17, 2021, 10:49 p.m. UTC
From: Chris Wilson <chris@chris-wilson.co.uk>

While the power consumption is proportional to the frequency, there is
also a static draw for active gates. The longer we are able to powergate
(rc6), the lower the static draw. Thus there is a sweet spot in the
frequency/power curve where we run at higher frequency in order to sleep
longer, aka race-to-idle. This is more evident at lower frequencies, so
let's look to bump the frequency if we think we will benefit by sleeping
longer at the higher frequency and so conserving power.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_rps.c | 31 ++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

Comments

Rodrigo Vivi Nov. 22, 2021, 6:44 p.m. UTC | #1
On Wed, Nov 17, 2021 at 02:49:55PM -0800, Vinay Belgaumkar wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> While the power consumption is proportional to the frequency, there is
> also a static draw for active gates. The longer we are able to powergate
> (rc6), the lower the static draw. Thus there is a sweetspot in the
> frequency/power curve where we run at higher frequency in order to sleep
> longer, aka race-to-idle. This is more evident at lower frequencies, so
> let's look to bump the frequency if we think we will benefit by sleeping
> longer at the higher frequency and so conserving power.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>

Please let's not increase the complexity here, unless we have a very good
and documented reason.

Before trying to implement anything smart like this in the driver I'd like
to see data, power and performance results in different platforms and with
different workloads.

Thanks,
Rodrigo.

> ---
>  drivers/gpu/drm/i915/gt/intel_rps.c | 31 ++++++++++++++++++++++++-----
>  1 file changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
> index 3675ac93ded0..6af3231982af 100644
> --- a/drivers/gpu/drm/i915/gt/intel_rps.c
> +++ b/drivers/gpu/drm/i915/gt/intel_rps.c
> @@ -63,6 +63,22 @@ static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
>  	intel_uncore_write_fw(uncore, reg, val);
>  }
>  
> +static bool race_to_idle(struct intel_rps *rps, u64 busy, u64 dt)
> +{
> +	unsigned int this = rps->cur_freq;
> +	unsigned int next = rps->cur_freq + 1;
> +	u64 next_dt = next * max(busy, dt);
> +
> +	/*
> +	 * Compare estimated time spent in rc6 at the next power bin. If
> +	 * we expect to sleep longer than the estimated increased power
> +	 * cost of running at a higher frequency, it will be reduced power
> +	 * consumption overall.
> +	 */
> +	return (((next_dt - this * busy) >> 10) * this * this >
> +		((next_dt - next * busy) >> 10) * next * next);
> +}
> +
>  static void rps_timer(struct timer_list *t)
>  {
>  	struct intel_rps *rps = from_timer(rps, t, timer);
> @@ -133,7 +149,7 @@ static void rps_timer(struct timer_list *t)
>  			if (!max_busy[i])
>  				break;
>  
> -			busy += div_u64(max_busy[i], 1 << i);
> +			busy += max_busy[i] >> i;
>  		}
>  		GT_TRACE(rps_to_gt(rps),
>  			 "busy:%lld [%d%%], max:[%lld, %lld, %lld], interval:%d\n",
> @@ -141,13 +157,18 @@ static void rps_timer(struct timer_list *t)
>  			 max_busy[0], max_busy[1], max_busy[2],
>  			 rps->pm_interval);
>  
> -		if (100 * busy > rps->power.up_threshold * dt &&
> -		    rps->cur_freq < rps->max_freq_softlimit) {
> +		if (rps->cur_freq < rps->max_freq_softlimit &&
> +		    race_to_idle(rps, max_busy[0], dt)) {
> +			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
> +			rps->pm_interval = 1;
> +			schedule_work(&rps->work);
> +		} else if (rps->cur_freq < rps->max_freq_softlimit &&
> +			   100 * busy > rps->power.up_threshold * dt) {
>  			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
>  			rps->pm_interval = 1;
>  			schedule_work(&rps->work);
> -		} else if (100 * busy < rps->power.down_threshold * dt &&
> -			   rps->cur_freq > rps->min_freq_softlimit) {
> +		} else if (rps->cur_freq > rps->min_freq_softlimit &&
> +			   100 * busy < rps->power.down_threshold * dt) {
>  			rps->pm_iir |= GEN6_PM_RP_DOWN_THRESHOLD;
>  			rps->pm_interval = 1;
>  			schedule_work(&rps->work);
> -- 
> 2.34.0
>
Tvrtko Ursulin Nov. 23, 2021, 9:17 a.m. UTC | #2
On 22/11/2021 18:44, Rodrigo Vivi wrote:
> On Wed, Nov 17, 2021 at 02:49:55PM -0800, Vinay Belgaumkar wrote:
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> While the power consumption is proportional to the frequency, there is
>> also a static draw for active gates. The longer we are able to powergate
>> (rc6), the lower the static draw. Thus there is a sweetspot in the
>> frequency/power curve where we run at higher frequency in order to sleep
>> longer, aka race-to-idle. This is more evident at lower frequencies, so
>> let's look to bump the frequency if we think we will benefit by sleeping
>> longer at the higher frequency and so conserving power.
>>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> 
> Please let's not increase the complexity here, unless we have a very good
> and documented reason.
> 
> Before trying to implement anything smart like this in the driver I'd like
> to see data, power and performance results in different platforms and with
> different workloads.

Who has such a test suite and test farm which isn't focused on workloads
from a single customer? ;(

Regards,

Tvrtko
Rodrigo Vivi Nov. 23, 2021, 4:53 p.m. UTC | #3
On Tue, 2021-11-23 at 09:17 +0000, Tvrtko Ursulin wrote:
> 
> On 22/11/2021 18:44, Rodrigo Vivi wrote:
> > On Wed, Nov 17, 2021 at 02:49:55PM -0800, Vinay Belgaumkar wrote:
> > > From: Chris Wilson <chris@chris-wilson.co.uk>
> > > 
> > > While the power consumption is proportional to the frequency,
> > > there is
> > > also a static draw for active gates. The longer we are able to
> > > powergate
> > > (rc6), the lower the static draw. Thus there is a sweetspot in
> > > the
> > > frequency/power curve where we run at higher frequency in order
> > > to sleep
> > > longer, aka race-to-idle. This is more evident at lower
> > > frequencies, so
> > > let's look to bump the frequency if we think we will benefit by
> > > sleeping
> > > longer at the higher frequency and so conserving power.
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > 
> > Please let's not increase the complexity here, unless we have a
> > very good
> > and documented reason.
> > 
> > Before trying to implement anything smart like this in the driver
> > I'd like
> > to see data, power and performance results in different platforms
> > and with
> > different workloads.
> 
> Who has such test suite and test farm which isn't focused to
> workloads 
> from a single customer? ;(

Okay, maybe we don't need to cover the world here. But without seeing any
data at all it is hard to make this call.

> 
> Regards,
> 
> Tvrtko
Vinay Belgaumkar Nov. 23, 2021, 5:37 p.m. UTC | #4
On 11/17/2021 2:49 PM, Vinay Belgaumkar wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> While the power consumption is proportional to the frequency, there is
> also a static draw for active gates. The longer we are able to powergate
> (rc6), the lower the static draw. Thus there is a sweetspot in the
> frequency/power curve where we run at higher frequency in order to sleep
> longer, aka race-to-idle. This is more evident at lower frequencies, so
> let's look to bump the frequency if we think we will benefit by sleeping
> longer at the higher frequency and so conserving power.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>

Data collected does show some power savings.

Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_rps.c | 31 ++++++++++++++++++++++++-----
>   1 file changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
> index 3675ac93ded0..6af3231982af 100644
> --- a/drivers/gpu/drm/i915/gt/intel_rps.c
> +++ b/drivers/gpu/drm/i915/gt/intel_rps.c
> @@ -63,6 +63,22 @@ static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
>   	intel_uncore_write_fw(uncore, reg, val);
>   }
>   
> +static bool race_to_idle(struct intel_rps *rps, u64 busy, u64 dt)
> +{
> +	unsigned int this = rps->cur_freq;
> +	unsigned int next = rps->cur_freq + 1;
> +	u64 next_dt = next * max(busy, dt);
> +
> +	/*
> +	 * Compare estimated time spent in rc6 at the next power bin. If
> +	 * we expect to sleep longer than the estimated increased power
> +	 * cost of running at a higher frequency, it will be reduced power
> +	 * consumption overall.
> +	 */
> +	return (((next_dt - this * busy) >> 10) * this * this >
> +		((next_dt - next * busy) >> 10) * next * next);
> +}
> +
>   static void rps_timer(struct timer_list *t)
>   {
>   	struct intel_rps *rps = from_timer(rps, t, timer);
> @@ -133,7 +149,7 @@ static void rps_timer(struct timer_list *t)
>   			if (!max_busy[i])
>   				break;
>   
> -			busy += div_u64(max_busy[i], 1 << i);
> +			busy += max_busy[i] >> i;
>   		}
>   		GT_TRACE(rps_to_gt(rps),
>   			 "busy:%lld [%d%%], max:[%lld, %lld, %lld], interval:%d\n",
> @@ -141,13 +157,18 @@ static void rps_timer(struct timer_list *t)
>   			 max_busy[0], max_busy[1], max_busy[2],
>   			 rps->pm_interval);
>   
> -		if (100 * busy > rps->power.up_threshold * dt &&
> -		    rps->cur_freq < rps->max_freq_softlimit) {
> +		if (rps->cur_freq < rps->max_freq_softlimit &&
> +		    race_to_idle(rps, max_busy[0], dt)) {
> +			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
> +			rps->pm_interval = 1;
> +			schedule_work(&rps->work);
> +		} else if (rps->cur_freq < rps->max_freq_softlimit &&
> +			   100 * busy > rps->power.up_threshold * dt) {
>   			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
>   			rps->pm_interval = 1;
>   			schedule_work(&rps->work);
> -		} else if (100 * busy < rps->power.down_threshold * dt &&
> -			   rps->cur_freq > rps->min_freq_softlimit) {
> +		} else if (rps->cur_freq > rps->min_freq_softlimit &&
> +			   100 * busy < rps->power.down_threshold * dt) {
>   			rps->pm_iir |= GEN6_PM_RP_DOWN_THRESHOLD;
>   			rps->pm_interval = 1;
>   			schedule_work(&rps->work);
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
index 3675ac93ded0..6af3231982af 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -63,6 +63,22 @@  static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
 	intel_uncore_write_fw(uncore, reg, val);
 }
 
+static bool race_to_idle(struct intel_rps *rps, u64 busy, u64 dt)
+{
+	unsigned int this = rps->cur_freq;
+	unsigned int next = rps->cur_freq + 1;
+	u64 next_dt = next * max(busy, dt);
+
+	/*
+	 * Compare estimated time spent in rc6 at the next power bin. If
+	 * we expect to sleep longer than the estimated increased power
+	 * cost of running at a higher frequency, it will be reduced power
+	 * consumption overall.
+	 */
+	return (((next_dt - this * busy) >> 10) * this * this >
+		((next_dt - next * busy) >> 10) * next * next);
+}
+
 static void rps_timer(struct timer_list *t)
 {
 	struct intel_rps *rps = from_timer(rps, t, timer);
@@ -133,7 +149,7 @@  static void rps_timer(struct timer_list *t)
 			if (!max_busy[i])
 				break;
 
-			busy += div_u64(max_busy[i], 1 << i);
+			busy += max_busy[i] >> i;
 		}
 		GT_TRACE(rps_to_gt(rps),
 			 "busy:%lld [%d%%], max:[%lld, %lld, %lld], interval:%d\n",
@@ -141,13 +157,18 @@  static void rps_timer(struct timer_list *t)
 			 max_busy[0], max_busy[1], max_busy[2],
 			 rps->pm_interval);
 
-		if (100 * busy > rps->power.up_threshold * dt &&
-		    rps->cur_freq < rps->max_freq_softlimit) {
+		if (rps->cur_freq < rps->max_freq_softlimit &&
+		    race_to_idle(rps, max_busy[0], dt)) {
+			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
+			rps->pm_interval = 1;
+			schedule_work(&rps->work);
+		} else if (rps->cur_freq < rps->max_freq_softlimit &&
+			   100 * busy > rps->power.up_threshold * dt) {
 			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
 			rps->pm_interval = 1;
 			schedule_work(&rps->work);
-		} else if (100 * busy < rps->power.down_threshold * dt &&
-			   rps->cur_freq > rps->min_freq_softlimit) {
+		} else if (rps->cur_freq > rps->min_freq_softlimit &&
+			   100 * busy < rps->power.down_threshold * dt) {
 			rps->pm_iir |= GEN6_PM_RP_DOWN_THRESHOLD;
 			rps->pm_interval = 1;
 			schedule_work(&rps->work);