diff mbox series

[v2] drm/i915: Priority boost for new clients

Message ID 20180807072914.30047-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [v2] drm/i915: Priority boost for new clients | expand

Commit Message

Chris Wilson Aug. 7, 2018, 7:29 a.m. UTC
Taken from an idea used for FQ_CODEL, we give the first request of a
new request flow a small priority boost. These flows are likely to
correspond with short, interactive tasks and so be more latency sensitive
than the longer free running queues. As soon as the client has more than
one request in the queue, further requests are not boosted and it settles
down into ordinary steady state behaviour.  Such small kicks dramatically
help combat the starvation issue, by allowing each client the opportunity
to run even when the system is under heavy throughput load (within the
constraints of the user selected priority).

v2: Mark the preempted request as the start of a new flow, to prevent a
single client being continually gazumped by its peers.

Testcase: igt/benchmarks/rrul
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_request.c   | 16 ++++++++++++++--
 drivers/gpu/drm/i915/i915_scheduler.h |  4 +++-
 drivers/gpu/drm/i915/intel_lrc.c      | 23 ++++++++++++++++++-----
 3 files changed, 35 insertions(+), 8 deletions(-)

Comments

Tvrtko Ursulin Aug. 7, 2018, 9:08 a.m. UTC | #1
On 07/08/2018 08:29, Chris Wilson wrote:
> Taken from an idea used for FQ_CODEL, we give the first request of a
> new request flows a small priority boost. These flows are likely to
> correspond with short, interactive tasks and so be more latency sensitive
> than the longer free running queues. As soon as the client has more than
> one request in the queue, further requests are not boosted and it settles
> down into ordinary steady state behaviour.  Such small kicks dramatically
> help combat the starvation issue, by allowing each client the opportunity
> to run even when the system is under heavy throughput load (within the
> constraints of the user selected priority).
> 
> v2: Mark the preempted request as the start of a new flow, to prevent a
> single client being continually gazumped by its peers.
> 
> Testcase: igt/benchmarks/rrul
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_request.c   | 16 ++++++++++++++--
>   drivers/gpu/drm/i915/i915_scheduler.h |  4 +++-
>   drivers/gpu/drm/i915/intel_lrc.c      | 23 ++++++++++++++++++-----
>   3 files changed, 35 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index fdcd48651973..334a56daefad 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1125,8 +1125,20 @@ void i915_request_add(struct i915_request *request)
>   	 */
>   	local_bh_disable();
>   	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
> -	if (engine->schedule)
> -		engine->schedule(request, &request->gem_context->sched);
> +	if (engine->schedule) {
> +		struct i915_sched_attr attr = request->gem_context->sched;
> +
> +		/*
> +		 * Boost priorities to new clients (new request flows).
> +		 *
> +		 * Allow interactive/synchronous clients to jump ahead of
> +		 * the bulk clients. (FQ_CODEL)
> +		 */
> +		if (!prev || i915_request_completed(prev))
> +			attr.priority |= I915_PRIORITY_NEWCLIENT;
> +
> +		engine->schedule(request, &attr);
> +	}
>   	rcu_read_unlock();
>   	i915_sw_fence_commit(&request->submit);
>   	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 7edfad0abfd7..e9fb6c1d5e42 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -19,12 +19,14 @@ enum {
>   	I915_PRIORITY_INVALID = INT_MIN
>   };
>   
> -#define I915_USER_PRIORITY_SHIFT 0
> +#define I915_USER_PRIORITY_SHIFT 1
>   #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
>   
>   #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
>   #define I915_PRIORITY_MASK (-I915_PRIORITY_COUNT)
>   
> +#define I915_PRIORITY_NEWCLIENT BIT(0)
> +
>   struct i915_sched_attr {
>   	/**
>   	 * @priority: execution and service priority
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 1558214eb502..6dec59872282 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -363,9 +363,9 @@ static void unwind_wa_tail(struct i915_request *rq)
>   
>   static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   {
> -	struct i915_request *rq, *rn;
> +	struct i915_request *rq, *rn, *active = NULL;
>   	struct list_head *uninitialized_var(pl);
> -	int last_prio = I915_PRIORITY_INVALID;
> +	int prio = I915_PRIORITY_INVALID;
>   
>   	lockdep_assert_held(&engine->timeline.lock);
>   
> @@ -379,12 +379,25 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   		unwind_wa_tail(rq);
>   
>   		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
> -		if (rq_prio(rq) != last_prio) {
> -			last_prio = rq_prio(rq);
> -			pl = lookup_priolist(engine, last_prio);
> +		if (rq_prio(rq) != prio) {
> +			prio = rq_prio(rq);
> +			pl = lookup_priolist(engine, prio);
>   		}
>   
>   		list_add(&rq->sched.link, pl);
> +
> +		active = rq;
> +	}
> +
> +	/*
> +	 * The active request is now effectively the start of a new client
> +	 * stream, so give it the equivalent small priority bump to prevent
> +	 * it being gazumped a second time by another peer.
> +	 */
> +	if (!(prio & I915_PRIORITY_NEWCLIENT)) {
> +		list_move_tail(&active->sched.link,
> +			       lookup_priolist(engine,
> +					       prio | I915_PRIORITY_NEWCLIENT));
>   	}
>   }
>   
> 

This sounds fair, I think I'm okay with it. Does it still work well for 
mixed media workloads?

Regards,

Tvrtko
Chris Wilson Aug. 7, 2018, 3:02 p.m. UTC | #2
Quoting Tvrtko Ursulin (2018-08-07 10:08:28)
> 
> On 07/08/2018 08:29, Chris Wilson wrote:
> > +     /*
> > +      * The active request is now effectively the start of a new client
> > +      * stream, so give it the equivalent small priority bump to prevent
> > +      * it being gazumped a second time by another peer.
> > +      */
> > +     if (!(prio & I915_PRIORITY_NEWCLIENT)) {
> > +             list_move_tail(&active->sched.link,
> > +                            lookup_priolist(engine,
> > +                                            prio | I915_PRIORITY_NEWCLIENT));
> >       }
> >   }
> >   
> > 
> 
> This sounds fair, I think I'm okay with it. Does it still work well for 
> mixed media workloads?

Grr. Current drm-tip is scoring much much higher in fairness than I
remember, and there's no apparent improvement, even little room for
possible improvement. When in doubt, blame ksoftirqd ;)

Quite surprising though, it was (at least if memory serves) a dramatic
improvement from this patch. Time to see if the metrics do resemble what
I think they should be, and to see if I can find a good example in
media-bench.pl
-Chris
Tvrtko Ursulin Aug. 8, 2018, 12:40 p.m. UTC | #3
On 07/08/2018 16:02, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-08-07 10:08:28)
>>
>> On 07/08/2018 08:29, Chris Wilson wrote:
>>> +     /*
>>> +      * The active request is now effectively the start of a new client
>>> +      * stream, so give it the equivalent small priority bump to prevent
>>> +      * it being gazumped a second time by another peer.
>>> +      */
>>> +     if (!(prio & I915_PRIORITY_NEWCLIENT)) {
>>> +             list_move_tail(&active->sched.link,
>>> +                            lookup_priolist(engine,
>>> +                                            prio | I915_PRIORITY_NEWCLIENT));
>>>        }
>>>    }
>>>    
>>>
>>
>> This sounds fair, I think I'm okay with it. Does it still work well for
>> mixed media workloads?
> 
> Grr. Current drm-tip is scoring much much higher in fairness than I
> remember, and there's no apparent improvement, even little room for
> possible improvement. When in doubt, blame ksoftirqd ;)

Perhaps previous testing was before direct submission?

> Quite surprising though, it was (at least if memory serves) a dramatic
> improvement from this patch. Time to see if the metrics do resemble what
> I think they should be, and to see if I can find a good example in
> media-bench.pl

I should maybe rename it to wsim-bench.pl with a switch to select 
workload groups per directory or something..

Regards,

Tvrtko
Chris Wilson Aug. 8, 2018, 6:53 p.m. UTC | #4
Quoting Tvrtko Ursulin (2018-08-08 13:40:41)
> 
> On 07/08/2018 16:02, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-08-07 10:08:28)
> >>
> >> On 07/08/2018 08:29, Chris Wilson wrote:
> >>> +     /*
> >>> +      * The active request is now effectively the start of a new client
> >>> +      * stream, so give it the equivalent small priority bump to prevent
> >>> +      * it being gazumped a second time by another peer.
> >>> +      */
> >>> +     if (!(prio & I915_PRIORITY_NEWCLIENT)) {
> >>> +             list_move_tail(&active->sched.link,
> >>> +                            lookup_priolist(engine,
> >>> +                                            prio | I915_PRIORITY_NEWCLIENT));
> >>>        }
> >>>    }
> >>>    
> >>>
> >>
> >> This sounds fair, I think I'm okay with it. Does it still work well for
> >> mixed media workloads?
> > 
> > Grr. Current drm-tip is scoring much much higher in fairness than I
> > remember, and there's no apparent improvement, even little room for
> > possible improvement. When in doubt, blame ksoftirqd ;)
> 
> Perhaps previous testing was before direct submission?

I went back and checked that, unfortunately not that simple. The
influencing factor appears to be the choice of workload. I am seeing
some now that respond favourably (by pure chance selection) but I need
to spend more time to ensure the results are stable, and see if there's
any method to the madness in selection.
 
> > Quite surprising though, it was (at least if memory serves) a dramatic
> > improvement from this patch. Time to see if the metrics do resemble what
> > I think they should be, and to see if I can find a good example in
> > media-bench.pl
> 
> I should maybe rename it to wsim-bench.pl with a switch to select 
> workload groups per directory or something..

What I'm thinking is closer to the expression I want is

diff --git a/scripts/media-bench.pl b/scripts/media-bench.pl
index ddf9c0ec..f39b1bb1 100755
--- a/scripts/media-bench.pl
+++ b/scripts/media-bench.pl
@@ -339,7 +339,6 @@ sub find_saturation_point
                } elsif ($c == 1) {
                        $swps = $wps;
                        return ($c, $wps, $swps, $wwps) if $wcnt > 1 or
-                                                          $multi_mode or
                                                           ($wps_target_param < 0 and
                                                            $wps_target == 0);
                }

We want to find the saturated peak of each wsim. We could use sim_wsim
for this.

@@ -553,7 +552,7 @@ foreach my $wrk (@saturation_workloads) {

                                        # Normalize mixed sum with sum of
                                        # individual runs.
-                                       $w *= 100;
+                                       $w *= 100 * scalar(@multi_workloads);
                                        $w /= $tot;

Knowing the saturation point of each wsim, we should be able to saturate
the engine running all simultaneous, each taking 1/N of the bw.
(Spherical cows and all that). Applying the factor of N here sets the
normalized target to 100%.

                                        # Second metric is average of each
@@ -563,10 +562,11 @@ foreach my $wrk (@saturation_workloads) {
                                        $s = 0;
                                        $widx = 0;
                                        foreach my $wrk (@multi_workloads) {
-                                               $s += 100 * $bwwps->[$widx] /
-                                                     $allwps{$wrk}->{$best_bid{$wrk}};
+                                               my $target = $allwps{$wrk}->{$best_bid{$wrk}} / scalar(@multi_workloads);
+                                               $s += 1. - abs($target - $bwwps->[$widx]) / $target;
                                                $widx++;
                                        }
+                                       $s *= 100;
                                        $s /= scalar(@multi_workloads);

This is the challenge. My idea of fairness is that each wsim got 1/N of
the bw, so should see wps of its saturated max / N. Fairness is then how
close we get to our theoretical fair slice.

Problem is the spherical cows are multiplying. It's box packing (linear
optimisation) problem, probably best to quantify the ideal fair slice
using sim_wsim and then report how unfair reality was.
-Chris
Chris Wilson Aug. 8, 2018, 7:24 p.m. UTC | #5
Quoting Chris Wilson (2018-08-08 19:53:33)
> Quoting Tvrtko Ursulin (2018-08-08 13:40:41)
> > 
> > On 07/08/2018 16:02, Chris Wilson wrote:
> > > Quoting Tvrtko Ursulin (2018-08-07 10:08:28)
> > >>
> > >> On 07/08/2018 08:29, Chris Wilson wrote:
> > >>> +     /*
> > >>> +      * The active request is now effectively the start of a new client
> > >>> +      * stream, so give it the equivalent small priority bump to prevent
> > >>> +      * it being gazumped a second time by another peer.
> > >>> +      */
> > >>> +     if (!(prio & I915_PRIORITY_NEWCLIENT)) {
> > >>> +             list_move_tail(&active->sched.link,
> > >>> +                            lookup_priolist(engine,
> > >>> +                                            prio | I915_PRIORITY_NEWCLIENT));
> > >>>        }
> > >>>    }
> > >>>    
> > >>>
> > >>
> > >> This sounds fair, I think I'm okay with it. Does it still work well for
> > >> mixed media workloads?
> > > 
> > > Grr. Current drm-tip is scoring much much higher in fairness than I
> > > remember, and there's no apparent improvement, even little room for
> > > possible improvement. When in doubt, blame ksoftirqd ;)
> > 
> > Perhaps previous testing was before direct submission?
> 
> I went back and checked that, unfortunately not that simple. The
> influencing factor appears to be the choice of workload. I am seeing
> some now that respond favourably (by pure chance selection) but I need
> to spend more time to ensure the results are stable, and see if there's
> any method to the madness in selection.

Early figures, suffering a bit from lack of transparency in the results.
(Using a pinned kbl gt4e)

TLDR;
w/o boosts:    busy balancer ('-b busy -R'): Aggregate (normalized) 89.28%; fairness 79.64%
w/  boosts:    busy balancer ('-b busy -R'): Aggregate (normalized) 108.92%; fairness 64.72%

Fairness went down (grumble) but that's a large increase in the
multiw-sim throughput.

Before:
ickle@kabylake:~/intel-gpu-tools$ sudo ./scripts/media-bench.pl -n 306765 -B busy -m -W media_nn_1080p_s1.wsim,media_nn_1080p_s2.wsim,media_nn_1080p_s3.wsim,media_nn_1080p.wsim,media_nn_480p.wsim
Workloads:
  media_nn_1080p_s1.wsim
  media_nn_1080p_s2.wsim
  media_nn_1080p_s3.wsim
  media_nn_1080p.wsim
  media_nn_480p.wsim
Balancers: busy,
Target workload duration is 10s.
Calibration tolerance is 0.01.
Multi-workload mode.
Nop calibration is 306765.

Evaluating 'media_nn_1080p_s1.wsim'... 10s is 177 workloads. (error=0.000300000000000011)
  Finding saturation points for 'media_nn_1080p_s1.wsim'...
    busy balancer ('-b busy -R'): 4 clients (31.844 wps, 17.671 wps single client, score=49.515).
    busy balancer ('-b busy -R -G'): 4 clients (31.957 wps, 17.667 wps single client, score=49.624).
    busy balancer ('-b busy -R -d'): 3 clients (31.904 wps, 17.677 wps single client, score=49.581).
    busy balancer ('-b busy -R -G -d'): 3 clients (32.046 wps, 17.624 wps single client, score=49.67).

Evaluating 'media_nn_1080p_s2.wsim'... 10s is 183 workloads. (error=0.00609999999999999)
  Finding saturation points for 'media_nn_1080p_s2.wsim'...
    busy balancer ('-b busy -R'): 3 clients (31.984 wps, 18.147 wps single client, score=50.131).
    busy balancer ('-b busy -R -G'): 3 clients (31.929 wps, 18.058 wps single client, score=49.987).
    busy balancer ('-b busy -R -d'): 2 clients (32.061 wps, 18.160 wps single client, score=50.221).
    busy balancer ('-b busy -R -G -d'): 3 clients (31.909 wps, 18.059 wps single client, score=49.968).

Evaluating 'media_nn_1080p_s3.wsim'... 10s is 182 workloads. (error=0.00719999999999992)
  Finding saturation points for 'media_nn_1080p_s3.wsim'...
    busy balancer ('-b busy -R'): 3 clients (31.957 wps, 18.044 wps single client, score=50.001).
    busy balancer ('-b busy -R -G'): 3 clients (31.930 wps, 18.134 wps single client, score=50.064).
    busy balancer ('-b busy -R -d'): 2 clients (32.047 wps, 18.054 wps single client, score=50.101).
    busy balancer ('-b busy -R -G -d'): 2 clients (31.972 wps, 18.040 wps single client, score=50.012).

Evaluating 'media_nn_1080p.wsim'... 10s is 156 workloads. (error=0.00470000000000006)
  Finding saturation points for 'media_nn_1080p.wsim'...
    busy balancer ('-b busy -R'): 4 clients (31.957 wps, 15.667 wps single client, score=47.624).
    busy balancer ('-b busy -R -G'): 4 clients (32.066 wps, 15.684 wps single client, score=47.75).
    busy balancer ('-b busy -R -d'): 5 clients (32.022 wps, 15.886 wps single client, score=47.908).
    busy balancer ('-b busy -R -G -d'): 4 clients (31.939 wps, 15.657 wps single client, score=47.596).

Evaluating 'media_nn_480p.wsim'... 10s is 337 workloads. (error=0.00820000000000007)
  Finding saturation points for 'media_nn_480p.wsim'...
    busy balancer ('-b busy -R'): 5 clients (74.077 wps, 33.412 wps single client, score=107.489).
    busy balancer ('-b busy -R -G'): 4 clients (73.849 wps, 33.415 wps single client, score=107.264).
    busy balancer ('-b busy -R -d'): 5 clients (73.357 wps, 33.319 wps single client, score=106.676).
    busy balancer ('-b busy -R -G -d'): 5 clients (73.540 wps, 33.497 wps single client, score=107.037).

Evaluating '-w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s1.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s2.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s3.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_480p.wsim'... 10s is 72 workloads. (error=0.00109999999999992)
  Finding saturation points for '-w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s1.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s2.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s3.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_480p.wsim'...
    busy balancer ('-b busy -R'): Aggregate (normalized) 89.28%; fairness 79.64%
    busy balancer ('-b busy -R -G'): Aggregate (normalized) 89.42%; fairness 79.52%
    busy balancer ('-b busy -R -d'): Aggregate (normalized) 89.60%; fairness 79.36%
    busy balancer ('-b busy -R -G -d'): Aggregate (normalized) 89.92%; fairness 79.15%
  Best balancer is '-b busy -R -G -d' (range=0.000886735101758962).

After:
ickle@kabylake:~/intel-gpu-tools$ sudo ./scripts/media-bench.pl -n 306765 -B busy -m -W media_nn_1080p_s1.wsim,media_nn_1080p_s2.wsim,media_nn_1080p_s3.wsim,media_nn_1080p.wsim,media_nn_480p.wsim
Workloads:
  media_nn_1080p_s1.wsim
  media_nn_1080p_s2.wsim
  media_nn_1080p_s3.wsim
  media_nn_1080p.wsim
  media_nn_480p.wsim
Balancers: busy,
Target workload duration is 10s.
Calibration tolerance is 0.01.
Multi-workload mode.
Nop calibration is 306765.

Evaluating 'media_nn_1080p_s1.wsim'... 10s is 177 workloads. (error=0.00180000000000007)
  Finding saturation points for 'media_nn_1080p_s1.wsim'...
    busy balancer ('-b busy -R'): 3 clients (31.846 wps, 17.674 wps single client, score=49.52).
    busy balancer ('-b busy -R -G'): 3 clients (31.929 wps, 17.668 wps single client, score=49.597).
    busy balancer ('-b busy -R -d'): 3 clients (31.874 wps, 17.620 wps single client, score=49.494).
    busy balancer ('-b busy -R -G -d'): 3 clients (31.888 wps, 17.676 wps single client, score=49.564).

Evaluating 'media_nn_1080p_s2.wsim'... 10s is 181 workloads. (error=0.000799999999999912)
  Finding saturation points for 'media_nn_1080p_s2.wsim'...
    busy balancer ('-b busy -R'): 3 clients (31.987 wps, 18.063 wps single client, score=50.05).
    busy balancer ('-b busy -R -G'): 3 clients (31.938 wps, 18.082 wps single client, score=50.02).
    busy balancer ('-b busy -R -d'): 3 clients (31.925 wps, 18.060 wps single client, score=49.985).
    busy balancer ('-b busy -R -G -d'): 2 clients (32.002 wps, 18.087 wps single client, score=50.089).

Evaluating 'media_nn_1080p_s3.wsim'... 10s is 182 workloads. (error=0.00779999999999994)
  Finding saturation points for 'media_nn_1080p_s3.wsim'...
    busy balancer ('-b busy -R'): 3 clients (32.005 wps, 18.042 wps single client, score=50.047).
    busy balancer ('-b busy -R -G'): 3 clients (32.013 wps, 18.156 wps single client, score=50.169).
    busy balancer ('-b busy -R -d'): 3 clients (32.090 wps, 18.079 wps single client, score=50.169).
    busy balancer ('-b busy -R -G -d'): 2 clients (32.094 wps, 18.038 wps single client, score=50.132).

Evaluating 'media_nn_1080p.wsim'... 10s is 156 workloads. (error=0.00459999999999994)
  Finding saturation points for 'media_nn_1080p.wsim'...
    busy balancer ('-b busy -R'): 5 clients (32.010 wps, 15.668 wps single client, score=47.678).
    busy balancer ('-b busy -R -G'): 4 clients (31.922 wps, 15.732 wps single client, score=47.654).
    busy balancer ('-b busy -R -d'): 3 clients (32.003 wps, 15.634 wps single client, score=47.637).
    busy balancer ('-b busy -R -G -d'): 4 clients (31.798 wps, 15.634 wps single client, score=47.432).

Evaluating 'media_nn_480p.wsim'... 10s is 336 workloads. (error=0.00510000000000002)
  Finding saturation points for 'media_nn_480p.wsim'...
    busy balancer ('-b busy -R'): 5 clients (73.326 wps, 33.979 wps single client, score=107.305).
    busy balancer ('-b busy -R -G'): 5 clients (73.588 wps, 33.410 wps single client, score=106.998).
    busy balancer ('-b busy -R -d'): 5 clients (73.734 wps, 33.280 wps single client, score=107.014).
    busy balancer ('-b busy -R -G -d'): 4 clients (73.139 wps, 33.768 wps single client, score=106.907).

Evaluating '-w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s1.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s2.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s3.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_480p.wsim'... 10s is 61 workloads. (error=0.00779999999999994)
  Finding saturation points for '-w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s1.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s2.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p_s3.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_1080p.wsim -w /home/ickle/intel-gpu-tools/benchmarks/wsim/media_nn_480p.wsim'...
    busy balancer ('-b busy -R'): Aggregate (normalized) 108.92%; fairness 64.72%
    busy balancer ('-b busy -R -G'): Aggregate (normalized) 109.23%; fairness 64.02%
    busy balancer ('-b busy -R -d'): Aggregate (normalized) 110.08%; fairness 55.50%
    busy balancer ('-b busy -R -G -d'): Aggregate (normalized) 108.42%; fairness 55.71%
  Best balancer is '-b busy -R' (range=0.0547794021204777).
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index fdcd48651973..334a56daefad 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1125,8 +1125,20 @@  void i915_request_add(struct i915_request *request)
 	 */
 	local_bh_disable();
 	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
-	if (engine->schedule)
-		engine->schedule(request, &request->gem_context->sched);
+	if (engine->schedule) {
+		struct i915_sched_attr attr = request->gem_context->sched;
+
+		/*
+		 * Boost priorities to new clients (new request flows).
+		 *
+		 * Allow interactive/synchronous clients to jump ahead of
+		 * the bulk clients. (FQ_CODEL)
+		 */
+		if (!prev || i915_request_completed(prev))
+			attr.priority |= I915_PRIORITY_NEWCLIENT;
+
+		engine->schedule(request, &attr);
+	}
 	rcu_read_unlock();
 	i915_sw_fence_commit(&request->submit);
 	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 7edfad0abfd7..e9fb6c1d5e42 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -19,12 +19,14 @@  enum {
 	I915_PRIORITY_INVALID = INT_MIN
 };
 
-#define I915_USER_PRIORITY_SHIFT 0
+#define I915_USER_PRIORITY_SHIFT 1
 #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
 
 #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
 #define I915_PRIORITY_MASK (-I915_PRIORITY_COUNT)
 
+#define I915_PRIORITY_NEWCLIENT BIT(0)
+
 struct i915_sched_attr {
 	/**
 	 * @priority: execution and service priority
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1558214eb502..6dec59872282 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -363,9 +363,9 @@  static void unwind_wa_tail(struct i915_request *rq)
 
 static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
-	struct i915_request *rq, *rn;
+	struct i915_request *rq, *rn, *active = NULL;
 	struct list_head *uninitialized_var(pl);
-	int last_prio = I915_PRIORITY_INVALID;
+	int prio = I915_PRIORITY_INVALID;
 
 	lockdep_assert_held(&engine->timeline.lock);
 
@@ -379,12 +379,25 @@  static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		unwind_wa_tail(rq);
 
 		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
-		if (rq_prio(rq) != last_prio) {
-			last_prio = rq_prio(rq);
-			pl = lookup_priolist(engine, last_prio);
+		if (rq_prio(rq) != prio) {
+			prio = rq_prio(rq);
+			pl = lookup_priolist(engine, prio);
 		}
 
 		list_add(&rq->sched.link, pl);
+
+		active = rq;
+	}
+
+	/*
+	 * The active request is now effectively the start of a new client
+	 * stream, so give it the equivalent small priority bump to prevent
+	 * it being gazumped a second time by another peer.
+	 */
+	if (!(prio & I915_PRIORITY_NEWCLIENT)) {
+		list_move_tail(&active->sched.link,
+			       lookup_priolist(engine,
+					       prio | I915_PRIORITY_NEWCLIENT));
 	}
 }