[1/6] drm/i915: Limit C-states when waiting for the active request

Message ID 20180806083017.32215-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [1/6] drm/i915: Limit C-states when waiting for the active request

Commit Message

Chris Wilson Aug. 6, 2018, 8:30 a.m. UTC
If we are waiting for the currently executing request, we have a good
idea that it will be completed in the very near future and so want to
cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.

v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE
v4: Beautification?
v5: And ignore the preemptibility of queue_work before schedule.
v6: Use highpri wq to keep our pm_qos window as small as possible.

Testcase: igt/gem_sync/store-default
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Francisco Jerez <currojerez@riseup.net>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
---
 drivers/gpu/drm/i915/i915_request.c | 59 +++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
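
For reference, a minimal sketch (not part of the patch) of the pm_qos
pattern the series builds on, assuming the pre-5.7 pm_qos_* kernel API:

	struct pm_qos_request req;

	/* Ask cpuidle for a wakeup latency under 50us, which in practice
	 * keeps the CPU out of the deeper C-states for as long as the
	 * request is registered. */
	pm_qos_add_request(&req, PM_QOS_CPU_DMA_LATENCY, 50);
	/* ... latency-sensitive wait ... */
	pm_qos_remove_request(&req);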

Comments

Tvrtko Ursulin Aug. 6, 2018, 9:34 a.m. UTC | #1
On 06/08/2018 09:30, Chris Wilson wrote:
> If we are waiting for the currently executing request, we have a good
> idea that it will be completed in the very near future and so want to
> cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.

I cannot shake the opinion that we shouldn't be doing this. For
instance, what if the client has been re-niced (down), or has re-niced
itself? It would obviously be wrong to apply this in those cases.
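
A hypothetical guard to illustrate the concern; neither the check nor
its placement is in the patch:

	/* Hypothetical: skip the PM QoS boost for de-prioritised clients,
	 * e.g. at the top of wait_dma_qos_add(). */
	if (task_nice(current) > 0)
		return NULL;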

Or take the claim that we have a good idea something will be completed
in the very near future. Say there is a 60fps workload which sends 5ms
batches and waits on them. That would be roughly 30% of the time (5ms
of every ~16.7ms frame) spent outside of low C-states for a workload
which doesn't need it.

Also, having read what the OpenCL driver does, where they want to apply
different wait optimisations at different call-sites, the idea that we
should instead introduce a low-latency flag to the wait ioctl sounds
more appropriate.
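
A sketch of what such an explicit flag might look like;
I915_WAIT_LOW_LATENCY and its bit value are hypothetical, not part of
the uapi or this series:

	#define I915_WAIT_LOW_LATENCY	BIT(7)	/* hypothetical */

	/* In i915_request_wait(), instead of inferring the need from the
	 * seqno, honour an explicit opt-in from the caller: */
	if (!qos && flags & I915_WAIT_LOW_LATENCY)
		qos = wait_dma_qos_add();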

> 
> v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
> v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE
> v4: Beautification?
> v5: And ignore the preemptibility of queue_work before schedule.
> v6: Use highpri wq to keep our pm_qos window as small as possible.
> 
> Testcase: igt/gem_sync/store-default
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Francisco Jerez <currojerez@riseup.net>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_request.c | 59 +++++++++++++++++++++++++++++
>   1 file changed, 59 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 5c2c93cbab12..67fd2ec75d78 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1258,6 +1258,58 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
>   	return true;
>   }
>   
> +struct wait_dma_qos {
> +	struct pm_qos_request req;
> +	struct work_struct add, del;
> +};
> +
> +static void __wait_dma_qos_add(struct work_struct *work)
> +{
> +	struct wait_dma_qos *qos = container_of(work, typeof(*qos), add);
> +
> +	pm_qos_add_request(&qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
> +}
> +
> +static void __wait_dma_qos_del(struct work_struct *work)
> +{
> +	struct wait_dma_qos *qos = container_of(work, typeof(*qos), del);
> +
> +	if (!cancel_work_sync(&qos->add))
> +		pm_qos_remove_request(&qos->req);
> +
> +	kfree(qos);
> +}
> +
> +static struct wait_dma_qos *wait_dma_qos_add(void)
> +{
> +	struct wait_dma_qos *qos;
> +
> +	/* Called under TASK_INTERRUPTIBLE, so not allowed to sleep/block. */
> +	qos = kzalloc(sizeof(*qos), GFP_NOWAIT | __GFP_NOWARN);
> +	if (!qos)
> +		return NULL;
> +
> +	INIT_WORK(&qos->add, __wait_dma_qos_add);
> +	INIT_WORK(&qos->del, __wait_dma_qos_del);
> +
> +	/*
> +	 * Schedule the enabling work on the local cpu so that it should only
> +	 * take effect if we actually sleep. If schedule() short circuits due to
> +	 * our request already being completed, we should then be able to cancel
> +	 * the work before it is even run.
> +	 */
> +	queue_work_on(raw_smp_processor_id(), system_highpri_wq, &qos->add);
> +
> +	return qos;
> +}
> +
> +static void wait_dma_qos_del(struct wait_dma_qos *qos)
> +{
> +	/* Defer to a worker so as not to incur extra latency for our woken client. */
> +	if (qos)
> +		queue_work(system_highpri_wq, &qos->del);
> +}
> +
>   /**
>    * i915_request_wait - wait until execution of request has finished
>    * @rq: the request to wait upon
> @@ -1286,6 +1338,7 @@ long i915_request_wait(struct i915_request *rq,
>   	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
>   	DEFINE_WAIT_FUNC(reset, default_wake_function);
>   	DEFINE_WAIT_FUNC(exec, default_wake_function);
> +	struct wait_dma_qos *qos = NULL;
>   	struct intel_wait wait;
>   
>   	might_sleep();
> @@ -1363,6 +1416,11 @@ long i915_request_wait(struct i915_request *rq,
>   			break;
>   		}
>   
> +		if (!qos &&
> +		    i915_seqno_passed(intel_engine_get_seqno(rq->engine),
> +				      wait.seqno - 1))

I also realized that this will get incorrectly applied when there is
preemption. If a low-priority request gets preempted after we applied
the PM QoS, it will persist for much longer than intended. (Until the
high-prio request completes, and then the low-prio one.) And the
explicit low-latency wait flag would have the same problem. We could
perhaps go with removing the PM QoS request if preempted. It should not
be frequent enough to cause an issue with too much traffic on the API.

Another side note - a quick grep shows there are a few other "seqno - 1"
call-sites, so perhaps we should add a helper for this with a more
self-explanatory name, like __i915_seqno_is_executing(engine, seqno) or
something?
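
Such a helper might look like the following sketch, reusing the patch's
own idiom (the name is only the suggestion above, not code from the
series):

	static inline bool
	__i915_seqno_is_executing(struct intel_engine_cs *engine, u32 seqno)
	{
		/* The request has started (or completed) once the engine
		 * has passed the preceding breadcrumb, seqno - 1. */
		return i915_seqno_passed(intel_engine_get_seqno(engine),
					 seqno - 1);
	}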

> +			qos = wait_dma_qos_add();
> +
>   		timeout = io_schedule_timeout(timeout);
>   
>   		if (intel_wait_complete(&wait) &&
> @@ -1412,6 +1470,7 @@ long i915_request_wait(struct i915_request *rq,
>   	if (flags & I915_WAIT_LOCKED)
>   		remove_wait_queue(errq, &reset);
>   	remove_wait_queue(&rq->execute, &exec);
> +	wait_dma_qos_del(qos);
>   	trace_i915_request_wait_end(rq);
>   
>   	return timeout;
>
Regards,

Tvrtko
Chris Wilson Aug. 6, 2018, 9:59 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-08-06 10:34:54)
> 
> On 06/08/2018 09:30, Chris Wilson wrote:
> > If we are waiting for the currently executing request, we have a good
> > idea that it will be completed in the very near future and so want to
> > cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.
> 
> I cannot shake the opinion that we shouldn't be doing this. For
> instance, what if the client has been re-niced (down), or has re-niced
> itself? It would obviously be wrong to apply this in those cases.

Niceness only restricts a task's position on the scheduler runqueue; it
doesn't actually have any cpufreq implications (give or take RT
heuristics). So I don't think we need a tsk->prio restriction.

> Or take the claim that we have a good idea something will be completed
> in the very near future. Say there is a 60fps workload which sends 5ms
> batches and waits on them. That would be roughly 30% of the time (5ms
> of every ~16.7ms frame) spent outside of low C-states for a workload
> which doesn't need it.

Quite frankly, they shouldn't be waiting on the current frame. For
example, in Mesa you wait for the end of the previous frame, which
should be roughly complete, and since it is a stall before computing
the next frame, latency is still important.
 
> Also, having read what the OpenCL driver does, where they want to apply
> different wait optimisations at different call-sites, the idea that we
> should instead introduce a low-latency flag to the wait ioctl sounds
> more appropriate.

I'm not impressed by what I've heard there yet. There's also the
dilemma of what to do with dma-fence poll().

> > +             if (!qos &&
> > +                 i915_seqno_passed(intel_engine_get_seqno(rq->engine),
> > +                                   wait.seqno - 1))
> 
> I also realized that this will get incorrectly applied when there is
> preemption. If a low-priority request gets preempted after we applied
> the PM QoS, it will persist for much longer than intended. (Until the
> high-prio request completes, and then the low-prio one.) And the
> explicit low-latency wait flag would have the same problem. We could
> perhaps go with removing the PM QoS request if preempted. It should not
> be frequent enough to cause an issue with too much traffic on the API.

Sure, I didn't think it was worth worrying about. We could cancel it and
re-arm it on the next execution.
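
A hypothetical shape for that cancel-and-re-arm, building on the helper
sketched above; no preemption notification exists in this series to
drive it:

	/* Hypothetical: inside the wait loop, drop the boost if the
	 * request is no longer executing; the existing !qos test will
	 * re-arm it once the request is scheduled back in. */
	if (qos && !__i915_seqno_is_executing(rq->engine, wait.seqno)) {
		wait_dma_qos_del(qos);
		qos = NULL;
	}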
 
> Another side note - a quick grep shows there are a few other "seqno - 1"
> call-sites, so perhaps we should add a helper for this with a more
> self-explanatory name, like __i915_seqno_is_executing(engine, seqno) or
> something?

I briefly considered something along those lines,
intel_engine_has_signaled(), intel_engine_has_started(). I also noticed
that I didn't kill i915_request_started() even though I thought we had.
-Chris
Tvrtko Ursulin Aug. 6, 2018, 10:28 a.m. UTC | #3
On 06/08/2018 10:59, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-08-06 10:34:54)
>>
>> On 06/08/2018 09:30, Chris Wilson wrote:
>>> If we are waiting for the currently executing request, we have a good
>>> idea that it will be completed in the very near future and so want to
>>> cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.
>>
>> I cannot shake the opinion that we shouldn't be doing this. For
>> instance, what if the client has been re-niced (down), or has re-niced
>> itself? It would obviously be wrong to apply this in those cases.
> 
> Niceness only restricts a task's position on the scheduler runqueue; it
> doesn't actually have any cpufreq implications (give or take RT
> heuristics). So I don't think we need a tsk->prio restriction.

I was thinking that such a client obviously doesn't care about latency
(more or less), so we would be incorrectly applying the PM QoS request
on its behalf.

>> Or take the claim that we have a good idea something will be completed
>> in the very near future. Say there is a 60fps workload which sends 5ms
>> batches and waits on them. That would be roughly 30% of the time (5ms
>> of every ~16.7ms frame) spent outside of low C-states for a workload
>> which doesn't need it.
> 
> Quite frankly, they shouldn't be waiting on the current frame. For
> example, in Mesa you wait for the end of the previous frame, which
> should be roughly complete, and since it is a stall before computing
> the next frame, latency is still important.

Maybe, but I think we shouldn't rely on assumptions about how a client
might use it, or create limitations / hidden gotchas at this level when
clients do not behave as we expect / prescribe.

But I have noticed that so far you have been avoiding commenting on the
idea of an explicit flag. :)

Regards,

Tvrtko

>   
>> Also, having read what the OpenCL driver does, where they want to apply
>> different wait optimisations at different call-sites, the idea that we
>> should instead introduce a low-latency flag to the wait ioctl sounds
>> more appropriate.
> 
> I'm not impressed by what I've heard there yet. There's also the
> dilemma of what to do with dma-fence poll().
> 
>>> +             if (!qos &&
>>> +                 i915_seqno_passed(intel_engine_get_seqno(rq->engine),
>>> +                                   wait.seqno - 1))
>>
>> I also realized that this will get incorrectly applied when there is
>> preemption. If a low-priority request gets preempted after we applied
>> the PM QoS, it will persist for much longer than intended. (Until the
>> high-prio request completes, and then the low-prio one.) And the
>> explicit low-latency wait flag would have the same problem. We could
>> perhaps go with removing the PM QoS request if preempted. It should not
>> be frequent enough to cause an issue with too much traffic on the API.
> 
> Sure, I didn't think it was worth worrying about. We could cancel it and
> re-arm it on the next execution.
>   
>> Another side note - a quick grep shows there are a few other "seqno - 1"
>> call-sites, so perhaps we should add a helper for this with a more
>> self-explanatory name, like __i915_seqno_is_executing(engine, seqno) or
>> something?
> 
> I briefly considered something along those lines,
> intel_engine_has_signaled(), intel_engine_has_started(). I also noticed
> that I didn't kill i915_request_started() even though I thought we had.
> -Chris
>
Patch

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5c2c93cbab12..67fd2ec75d78 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1258,6 +1258,58 @@  static bool __i915_wait_request_check_and_reset(struct i915_request *request)
 	return true;
 }
 
+struct wait_dma_qos {
+	struct pm_qos_request req;
+	struct work_struct add, del;
+};
+
+static void __wait_dma_qos_add(struct work_struct *work)
+{
+	struct wait_dma_qos *qos = container_of(work, typeof(*qos), add);
+
+	pm_qos_add_request(&qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
+}
+
+static void __wait_dma_qos_del(struct work_struct *work)
+{
+	struct wait_dma_qos *qos = container_of(work, typeof(*qos), del);
+
+	if (!cancel_work_sync(&qos->add))
+		pm_qos_remove_request(&qos->req);
+
+	kfree(qos);
+}
+
+static struct wait_dma_qos *wait_dma_qos_add(void)
+{
+	struct wait_dma_qos *qos;
+
+	/* Called under TASK_INTERRUPTIBLE, so not allowed to sleep/block. */
+	qos = kzalloc(sizeof(*qos), GFP_NOWAIT | __GFP_NOWARN);
+	if (!qos)
+		return NULL;
+
+	INIT_WORK(&qos->add, __wait_dma_qos_add);
+	INIT_WORK(&qos->del, __wait_dma_qos_del);
+
+	/*
+	 * Schedule the enabling work on the local cpu so that it should only
+	 * take effect if we actually sleep. If schedule() short circuits due to
+	 * our request already being completed, we should then be able to cancel
+	 * the work before it is even run.
+	 */
+	queue_work_on(raw_smp_processor_id(), system_highpri_wq, &qos->add);
+
+	return qos;
+}
+
+static void wait_dma_qos_del(struct wait_dma_qos *qos)
+{
+	/* Defer to a worker so as not to incur extra latency for our woken client. */
+	if (qos)
+		queue_work(system_highpri_wq, &qos->del);
+}
+
 /**
  * i915_request_wait - wait until execution of request has finished
  * @rq: the request to wait upon
@@ -1286,6 +1338,7 @@  long i915_request_wait(struct i915_request *rq,
 	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
 	DEFINE_WAIT_FUNC(reset, default_wake_function);
 	DEFINE_WAIT_FUNC(exec, default_wake_function);
+	struct wait_dma_qos *qos = NULL;
 	struct intel_wait wait;
 
 	might_sleep();
@@ -1363,6 +1416,11 @@  long i915_request_wait(struct i915_request *rq,
 			break;
 		}
 
+		if (!qos &&
+		    i915_seqno_passed(intel_engine_get_seqno(rq->engine),
+				      wait.seqno - 1))
+			qos = wait_dma_qos_add();
+
 		timeout = io_schedule_timeout(timeout);
 
 		if (intel_wait_complete(&wait) &&
@@ -1412,6 +1470,7 @@  long i915_request_wait(struct i915_request *rq,
 	if (flags & I915_WAIT_LOCKED)
 		remove_wait_queue(errq, &reset);
 	remove_wait_queue(&rq->execute, &exec);
+	wait_dma_qos_del(qos);
 	trace_i915_request_wait_end(rq);
 
 	return timeout;
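
For reference, the lifecycle of the QoS request as driven by the wait
loop above, condensed into a comment (a summary of the patch, not code
from it):

	/*
	 * qos = wait_dma_qos_add();	queues ->add on the local cpu
	 * io_schedule_timeout(...);	->add runs only if we really sleep
	 * wait_dma_qos_del(qos);	->del either cancels a pending ->add
	 *				or removes the pm_qos request, then
	 *				frees qos, off the waker's path
	 */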