diff mbox series

[2/3] drm/i915/gt: Compare average group occupancy for RPS evaluation

Message ID 20211117224955.28999-3-vinay.belgaumkar@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915/gt: RPS tuning for light media playback | expand

Commit Message

Vinay Belgaumkar Nov. 17, 2021, 10:49 p.m. UTC
From: Chris Wilson <chris.p.wilson@intel.com>

Currently, we inspect each engine individually and measure the occupancy
of that engine over the last evaluation interval. If that exceeds our
busyness thresholds, we decide to increase the GPU frequency. However,
under a load balancer, we should consider the occupancy of entire engine
groups, as work may be spread out across the group. In doing so, we
prefer wide over fast, as power consumption is approximately proportional
to the square of the frequency. However, since the load balancer is greedy,
the first idle engine gets all the work, and preferentially reuses the
last active engine, so under light loads all work is assigned to one
engine, and that engine appears very busy. But if the work happened
to overlap slightly, the workload would spread across multiple engines,
reducing each individual engine's runtime, and so reducing the rps
contribution, keeping the frequency low. Instead, when considering the
contribution, consider the contribution over the entire engine group
(capacity).

Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_rps.c | 48 ++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 14 deletions(-)

Comments

Vinay Belgaumkar Nov. 23, 2021, 5:35 p.m. UTC | #1
On 11/17/2021 2:49 PM, Vinay Belgaumkar wrote:
> From: Chris Wilson <chris.p.wilson@intel.com>
> 
> Currently, we inspect each engine individually and measure the occupancy
> of that engine over the last evaluation interval. If that exceeds our
> busyness thresholds, we decide to increase the GPU frequency. However,
> under a load balancer, we should consider the occupancy of entire engine
> groups, as work may be spread out across the group. In doing so, we
> prefer wide over fast, as power consumption is approximately proportional
> to the square of the frequency. However, since the load balancer is greedy,
> the first idle engine gets all the work, and preferentially reuses the
> last active engine, so under light loads all work is assigned to one
> engine, and that engine appears very busy. But if the work happened
> to overlap slightly, the workload would spread across multiple engines,
> reducing each individual engine's runtime, and so reducing the rps
> contribution, keeping the frequency low. Instead, when considering the
> contribution, consider the contribution over the entire engine group
> (capacity).
> 
> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>

Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>

> ---
>   drivers/gpu/drm/i915/gt/intel_rps.c | 48 ++++++++++++++++++++---------
>   1 file changed, 34 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
> index 07ff7ba7b2b7..3675ac93ded0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_rps.c
> +++ b/drivers/gpu/drm/i915/gt/intel_rps.c
> @@ -7,6 +7,7 @@
>   
>   #include "i915_drv.h"
>   #include "intel_breadcrumbs.h"
> +#include "intel_engine_pm.h"
>   #include "intel_gt.h"
>   #include "intel_gt_clock_utils.h"
>   #include "intel_gt_irq.h"
> @@ -65,26 +66,45 @@ static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
>   static void rps_timer(struct timer_list *t)
>   {
>   	struct intel_rps *rps = from_timer(rps, t, timer);
> -	struct intel_engine_cs *engine;
> -	ktime_t dt, last, timestamp;
> -	enum intel_engine_id id;
> +	struct intel_gt *gt = rps_to_gt(rps);
> +	ktime_t dt, last, timestamp = 0;
>   	s64 max_busy[3] = {};
> +	int i, j;
>   
> -	timestamp = 0;
> -	for_each_engine(engine, rps_to_gt(rps), id) {
> -		s64 busy;
> -		int i;
> +	/* Compare average occupancy over each engine group */
> +	for (i = 0; i < ARRAY_SIZE(gt->engine_class); i++) {
> +		s64 busy = 0;
> +		int count = 0;
> +
> +		for (j = 0; j < ARRAY_SIZE(gt->engine_class[i]); j++) {
> +			struct intel_engine_cs *engine;
>   
> -		dt = intel_engine_get_busy_time(engine, &timestamp);
> -		last = engine->stats.rps;
> -		engine->stats.rps = dt;
> +			engine = gt->engine_class[i][j];
> +			if (!engine)
> +				continue;
>   
> -		busy = ktime_to_ns(ktime_sub(dt, last));
> -		for (i = 0; i < ARRAY_SIZE(max_busy); i++) {
> -			if (busy > max_busy[i])
> -				swap(busy, max_busy[i]);
> +			dt = intel_engine_get_busy_time(engine, &timestamp);
> +			last = engine->stats.rps;
> +			engine->stats.rps = dt;
> +
> +			if (!intel_engine_pm_is_awake(engine))
> +				continue;
> +
> +			busy += ktime_to_ns(ktime_sub(dt, last));
> +			count++;
> +		}
> +
> +		if (count > 1)
> +			busy = div_u64(busy, count);
> +		if (busy <= max_busy[ARRAY_SIZE(max_busy) - 1])
> +			continue;
> +
> +		for (j = 0; j < ARRAY_SIZE(max_busy); j++) {
> +			if (busy > max_busy[j])
> +				swap(busy, max_busy[j]);
>   		}
>   	}
> +
>   	last = rps->pm_timestamp;
>   	rps->pm_timestamp = timestamp;
>   
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
index 07ff7ba7b2b7..3675ac93ded0 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -7,6 +7,7 @@ 
 
 #include "i915_drv.h"
 #include "intel_breadcrumbs.h"
+#include "intel_engine_pm.h"
 #include "intel_gt.h"
 #include "intel_gt_clock_utils.h"
 #include "intel_gt_irq.h"
@@ -65,26 +66,45 @@  static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
 static void rps_timer(struct timer_list *t)
 {
 	struct intel_rps *rps = from_timer(rps, t, timer);
-	struct intel_engine_cs *engine;
-	ktime_t dt, last, timestamp;
-	enum intel_engine_id id;
+	struct intel_gt *gt = rps_to_gt(rps);
+	ktime_t dt, last, timestamp = 0;
 	s64 max_busy[3] = {};
+	int i, j;
 
-	timestamp = 0;
-	for_each_engine(engine, rps_to_gt(rps), id) {
-		s64 busy;
-		int i;
+	/* Compare average occupancy over each engine group */
+	for (i = 0; i < ARRAY_SIZE(gt->engine_class); i++) {
+		s64 busy = 0;
+		int count = 0;
+
+		for (j = 0; j < ARRAY_SIZE(gt->engine_class[i]); j++) {
+			struct intel_engine_cs *engine;
 
-		dt = intel_engine_get_busy_time(engine, &timestamp);
-		last = engine->stats.rps;
-		engine->stats.rps = dt;
+			engine = gt->engine_class[i][j];
+			if (!engine)
+				continue;
 
-		busy = ktime_to_ns(ktime_sub(dt, last));
-		for (i = 0; i < ARRAY_SIZE(max_busy); i++) {
-			if (busy > max_busy[i])
-				swap(busy, max_busy[i]);
+			dt = intel_engine_get_busy_time(engine, &timestamp);
+			last = engine->stats.rps;
+			engine->stats.rps = dt;
+
+			if (!intel_engine_pm_is_awake(engine))
+				continue;
+
+			busy += ktime_to_ns(ktime_sub(dt, last));
+			count++;
+		}
+
+		if (count > 1)
+			busy = div_u64(busy, count);
+		if (busy <= max_busy[ARRAY_SIZE(max_busy) - 1])
+			continue;
+
+		for (j = 0; j < ARRAY_SIZE(max_busy); j++) {
+			if (busy > max_busy[j])
+				swap(busy, max_busy[j]);
 		}
 	}
+
 	last = rps->pm_timestamp;
 	rps->pm_timestamp = timestamp;