[1/3] drm/i915: Limit C-states when waiting for the active request

Message ID 20180730152522.31682-2-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series: [1/3] drm/i915: Limit C-states when waiting for the active request

Commit Message

Chris Wilson July 30, 2018, 3:25 p.m. UTC
If we are waiting for the currently executing request, we have a good
idea that it will be completed in the very near future and so want to
cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.

v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE
v4: Beautification?
v5: And ignore the preemptibility of queue_work before schedule.

Testcase: igt/gem_sync/store-default
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Francisco Jerez <currojerez@riseup.net>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_request.c | 52 +++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
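
For context, PM_QOS_CPU_DMA_LATENCY is the same constraint that is
exposed to userspace via /dev/cpu_dma_latency: while that device is
held open with a value written to it, the CPUs are prevented from
entering C-states whose exit latency exceeds that value (in
microseconds). A minimal userspace sketch of the equivalent request,
as an illustrative standalone tool rather than anything from this
series:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t max_latency_us = 50; /* the same 50us cap as the patch */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/cpu_dma_latency");
		return 1;
	}

	/* The constraint takes effect on write... */
	if (write(fd, &max_latency_us, sizeof(max_latency_us)) !=
	    sizeof(max_latency_us)) {
		perror("write");
		return 1;
	}

	pause(); /* ...and is dropped when the fd is closed on exit. */
	return 0;
}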

Comments

Chris Wilson Aug. 1, 2018, 9:56 a.m. UTC | #1
Quoting Chris Wilson (2018-07-30 16:25:20)
> If we are waiting for the currently executing request, we have a good
> idea that it will be completed in the very near future and so want to
> cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.
> 
> v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
> v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE
> v4: Beautification?
> v5: And ignore the preemptibility of queue_work before schedule.
> 
> Testcase: igt/gem_sync/store-default
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Francisco Jerez <currojerez@riseup.net>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>

media-bench disagrees with dropping iowait, but agrees with setting the
DMA_LATENCY pm_qos.
-Chris
Tvrtko Ursulin Aug. 3, 2018, 10:48 a.m. UTC | #2
On 30/07/2018 16:25, Chris Wilson wrote:
> If we are waiting for the currently executing request, we have a good
> idea that it will be completed in the very near future and so want to
> cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.

Maybe, but I have a feeling we shouldn't assume what userspace
wants. On the other hand, the "seqno - 1" guard alleviates some of my
concerns, just not sure if all of them.

> v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
> v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE
> v4: Beautification?
> v5: And ignore the preemptibility of queue_work before schedule.
> 
> Testcase: igt/gem_sync/store-default
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Francisco Jerez <currojerez@riseup.net>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_request.c | 52 +++++++++++++++++++++++++++++
>   1 file changed, 52 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 5c2c93cbab12..f3ff8dbe363d 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1258,6 +1258,51 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
>   	return true;
>   }
>   
> +struct wait_dma_qos {
> +	struct pm_qos_request req;
> +	struct work_struct add, del;
> +};
> +
> +static void __wait_dma_qos_add(struct work_struct *work)
> +{
> +	struct wait_dma_qos *qos = container_of(work, typeof(*qos), add);
> +
> +	pm_qos_add_request(&qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
> +}
> +
> +static void __wait_dma_qos_del(struct work_struct *work)
> +{
> +	struct wait_dma_qos *qos = container_of(work, typeof(*qos), del);
> +
> +	if (!cancel_work_sync(&qos->add))
> +		pm_qos_remove_request(&qos->req);
> +
> +	kfree(qos);
> +}
> +
> +static struct wait_dma_qos *wait_dma_qos_add(void)
> +{
> +	struct wait_dma_qos *qos;
> +
> +	/* Called under TASK_INTERRUPTIBLE, so not allowed to sleep/block. */
> +	qos = kzalloc(sizeof(*qos), GFP_NOWAIT | __GFP_NOWARN);

Is it too big to put on the stack in i915_request_wait? Looks like that 
would be simpler.

> +	if (!qos)
> +		return NULL;
> +
> +	INIT_WORK(&qos->add, __wait_dma_qos_add);
> +	INIT_WORK(&qos->del, __wait_dma_qos_del);
> +	schedule_work_on(raw_smp_processor_id(), &qos->add);

Do we want to use the highpri wq? But in any case we do have a worker 
latency here, which may completely defeat the 50us QoS request. :(

Also, do you need to specify the CPU manually, or is that in fact
detrimental to the worker running ASAP? AFAIU this makes the worker
only able to start once we go to sleep, with potentially other work
items queued ahead of ours.
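
For illustration, the highpri variant suggested here might look like
the following, using the stock system_highpri_wq (a sketch of the
suggestion, not what the patch as posted does):

/*
 * Hypothetical variant: queue onto the high-priority system workqueue,
 * still pinned to the local CPU so the worker only runs once the
 * waiter schedules out.
 */
queue_work_on(raw_smp_processor_id(), system_highpri_wq, &qos->add);

/* The teardown could be bumped to the highpri queue as well. */
queue_work(system_highpri_wq, &qos->del);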

> +
> +	return qos;
> +}
> +
> +static void wait_dma_qos_del(struct wait_dma_qos *qos)
> +{
> +	/* Defer to a worker so as not to incur extra latency for our woken client. */
> +	if (qos)
> +		schedule_work(&qos->del);
> +}
> +
>   /**
>    * i915_request_wait - wait until execution of request has finished
>    * @rq: the request to wait upon
> @@ -1286,6 +1331,7 @@ long i915_request_wait(struct i915_request *rq,
>   	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
>   	DEFINE_WAIT_FUNC(reset, default_wake_function);
>   	DEFINE_WAIT_FUNC(exec, default_wake_function);
> +	struct wait_dma_qos *qos = NULL;
>   	struct intel_wait wait;
>   
>   	might_sleep();
> @@ -1363,6 +1409,11 @@ long i915_request_wait(struct i915_request *rq,
>   			break;
>   		}
>   
> +		if (!qos &&
> +		    i915_seqno_passed(intel_engine_get_seqno(rq->engine),
> +				      wait.seqno - 1))
> +			qos = wait_dma_qos_add();
> +
>   		timeout = io_schedule_timeout(timeout);
>   
>   		if (intel_wait_complete(&wait) &&
> @@ -1412,6 +1463,7 @@ long i915_request_wait(struct i915_request *rq,
>   	if (flags & I915_WAIT_LOCKED)
>   		remove_wait_queue(errq, &reset);
>   	remove_wait_queue(&rq->execute, &exec);
> +	wait_dma_qos_del(qos);
>   	trace_i915_request_wait_end(rq);
>   
>   	return timeout;
> 

Another thing we talked about on IRC is the potential to introduce an
explicit low-latency flag to the gem_wait ioctl. That would punt the
responsibility to userspace to know whether it cares, but on the other
hand, if some benchmarks benefit from the implicit setting, that could
be tempting. You said media-bench likes it so I'll try it.

An explicit request would also simplify the code by removing the need
for workers, and remove the worker latency from the request, which may
be the most attractive benefit.

We could also apply the QoS request before the seqno assignment.
Currently I think there is a small window where the wait can race with
it and fall into a high-latency sleep, even if it would later choose to
request low-latency.
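
For the sake of discussion, such a flag could look roughly like this
from userspace. The flag name and bit below are invented purely for
illustration (the real drm_i915_gem_wait flags field must currently be
zero), so this is a sketch of the idea, not a proposal of actual uAPI:

#include <stdint.h>
#include <xf86drm.h>
#include <i915_drm.h>

/* Invented for illustration; not a real uAPI flag. */
#define I915_GEM_WAIT_LOW_LATENCY (1u << 0)

static int gem_wait_low_latency(int fd, uint32_t handle)
{
	struct drm_i915_gem_wait wait = {
		.bo_handle = handle,
		/* Opt in to the CPU_DMA_LATENCY cap for this wait only. */
		.flags = I915_GEM_WAIT_LOW_LATENCY,
		.timeout_ns = -1, /* wait indefinitely */
	};

	return drmIoctl(fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
}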

Regards,

Tvrtko
Chris Wilson Aug. 3, 2018, 11:07 a.m. UTC | #3
Quoting Tvrtko Ursulin (2018-08-03 11:48:44)
> 
> On 30/07/2018 16:25, Chris Wilson wrote:
> > If we are waiting for the currently executing request, we have a good
> > idea that it will be completed in the very near future and so want to
> > cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.
> 
> Maybe, but I have a feeling we shouldn't assume what userspace
> wants. On the other hand, the "seqno - 1" guard alleviates some of my
> concerns, just not sure if all of them.

Yup, it at least pretends to not be wholly evil.
 
> > v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
> > v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE
> > v4: Beautification?
> > v5: And ignore the preemptibility of queue_work before schedule.
> > 
> > Testcase: igt/gem_sync/store-default
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> > Cc: Francisco Jerez <currojerez@riseup.net>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_request.c | 52 +++++++++++++++++++++++++++++
> >   1 file changed, 52 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> > index 5c2c93cbab12..f3ff8dbe363d 100644
> > --- a/drivers/gpu/drm/i915/i915_request.c
> > +++ b/drivers/gpu/drm/i915/i915_request.c
> > @@ -1258,6 +1258,51 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
> >       return true;
> >   }
> >   
> > +struct wait_dma_qos {
> > +     struct pm_qos_request req;
> > +     struct work_struct add, del;
> > +};
> > +
> > +static void __wait_dma_qos_add(struct work_struct *work)
> > +{
> > +     struct wait_dma_qos *qos = container_of(work, typeof(*qos), add);
> > +
> > +     pm_qos_add_request(&qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
> > +}
> > +
> > +static void __wait_dma_qos_del(struct work_struct *work)
> > +{
> > +     struct wait_dma_qos *qos = container_of(work, typeof(*qos), del);
> > +
> > +     if (!cancel_work_sync(&qos->add))
> > +             pm_qos_remove_request(&qos->req);
> > +
> > +     kfree(qos);
> > +}
> > +
> > +static struct wait_dma_qos *wait_dma_qos_add(void)
> > +{
> > +     struct wait_dma_qos *qos;
> > +
> > +     /* Called under TASK_INTERRUPTIBLE, so not allowed to sleep/block. */
> > +     qos = kzalloc(sizeof(*qos), GFP_NOWAIT | __GFP_NOWARN);
> 
> Is it too big to put on the stack in i915_request_wait? Looks like that 
> would be simpler.

We don't want to be synchronous in our out path, as that directly adds
to the client latency.
 
> > +     if (!qos)
> > +             return NULL;
> > +
> > +     INIT_WORK(&qos->add, __wait_dma_qos_add);
> > +     INIT_WORK(&qos->del, __wait_dma_qos_del);
> > +     schedule_work_on(raw_smp_processor_id(), &qos->add);
> 
> Do we want to use the highpri wq? But in any case we do have a worker 
> latency here, which may completely defeat the 50us QoS request. :(
> 
> Also, do you need to specify the CPU manually, or is that in fact
> detrimental to the worker running ASAP? AFAIU this makes the worker
> only able to start once we go to sleep, with potentially other work
> items queued ahead of ours.

That was the idea with pinning it to the current CPU: the worker is
only run when we schedule ourselves out. If we skipped the schedule, we
wouldn't need to adjust the pm_qos. There is still some wiggle with
preemption, but I don't see that being a huge concern; if we are
switching CPUs, we may as well make sure we don't enter a high C-state.

> > +
> > +     return qos;
> > +}
> > +
> > +static void wait_dma_qos_del(struct wait_dma_qos *qos)
> > +{
> > +     /* Defer to a worker so as not to incur extra latency for our woken client. */
> > +     if (qos)
> > +             schedule_work(&qos->del);
> > +}
> > +
> >   /**
> >    * i915_request_wait - wait until execution of request has finished
> >    * @rq: the request to wait upon
> > @@ -1286,6 +1331,7 @@ long i915_request_wait(struct i915_request *rq,
> >       wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
> >       DEFINE_WAIT_FUNC(reset, default_wake_function);
> >       DEFINE_WAIT_FUNC(exec, default_wake_function);
> > +     struct wait_dma_qos *qos = NULL;
> >       struct intel_wait wait;
> >   
> >       might_sleep();
> > @@ -1363,6 +1409,11 @@ long i915_request_wait(struct i915_request *rq,
> >                       break;
> >               }
> >   
> > +             if (!qos &&
> > +                 i915_seqno_passed(intel_engine_get_seqno(rq->engine),
> > +                                   wait.seqno - 1))
> > +                     qos = wait_dma_qos_add();
> > +
> >               timeout = io_schedule_timeout(timeout);
> >   
> >               if (intel_wait_complete(&wait) &&
> > @@ -1412,6 +1463,7 @@ long i915_request_wait(struct i915_request *rq,
> >       if (flags & I915_WAIT_LOCKED)
> >               remove_wait_queue(errq, &reset);
> >       remove_wait_queue(&rq->execute, &exec);
> > +     wait_dma_qos_del(qos);
> >       trace_i915_request_wait_end(rq);
> >   
> >       return timeout;
> > 
> 
> Another thing we talked about on IRC is the potential to introduce an
> explicit low-latency flag to the gem_wait ioctl. That would punt the
> responsibility to userspace to know whether it cares, but on the other
> hand, if some benchmarks benefit from the implicit setting, that could
> be tempting. You said media-bench likes it so I'll try it.
> 
> An explicit request would also simplify the code by removing the need
> for workers, and remove the worker latency from the request, which may
> be the most attractive benefit.

No, we still need the workers, as the pm_qos_update can take a while if
it decides to change the C-state there and then, so we definitely don't
want that in the tail, and we might as well keep the trick of only
doing the pm_qos_update if we sleep.
 
> We could also apply the QoS request before the seqno assignment.
> Currently I think there is a small window where the wait can race with
> it and fall into a high-latency sleep, even if it would later choose to
> request low-latency.

That window should be followed by an interrupt, I think, and we should
have applied the irq_seqno barrier for gen5-gen7, so the seqno should be
valid wrt the previous interrupt.

A hard problem to debug for sure. Maybe we should preserve the
last-interrupt timestamp in the HWSP and then we can inspect that.
Ideally though we want a journal so we can go back in case the next
interrupt arrives before we wake up, making the latency seem smaller.
Still, some information (with detectable error) is better than none.
Let's see what that looks like.
-Chris
Tvrtko Ursulin Aug. 3, 2018, 1 p.m. UTC | #4
On 03/08/2018 12:07, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-08-03 11:48:44)
>>
>> On 30/07/2018 16:25, Chris Wilson wrote:
>>> If we are waiting for the currently executing request, we have a good
>>> idea that it will be completed in the very near future and so want to
>>> cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.
>>
>> Maybe, but I have a feeling we shouldn't assume what userspace
>> wants. On the other hand, the "seqno - 1" guard alleviates some of my
>> concerns, just not sure if all of them.
> 
> Yup, it at least pretends to not be wholly evil.
>   
>>> v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
>>> v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE
>>> v4: Beautification?
>>> v5: And ignore the preemptibility of queue_work before schedule.
>>>
>>> Testcase: igt/gem_sync/store-default
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>>> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
>>> Cc: Francisco Jerez <currojerez@riseup.net>
>>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_request.c | 52 +++++++++++++++++++++++++++++
>>>    1 file changed, 52 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
>>> index 5c2c93cbab12..f3ff8dbe363d 100644
>>> --- a/drivers/gpu/drm/i915/i915_request.c
>>> +++ b/drivers/gpu/drm/i915/i915_request.c
>>> @@ -1258,6 +1258,51 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
>>>        return true;
>>>    }
>>>    
>>> +struct wait_dma_qos {
>>> +     struct pm_qos_request req;
>>> +     struct work_struct add, del;
>>> +};
>>> +
>>> +static void __wait_dma_qos_add(struct work_struct *work)
>>> +{
>>> +     struct wait_dma_qos *qos = container_of(work, typeof(*qos), add);
>>> +
>>> +     pm_qos_add_request(&qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
>>> +}
>>> +
>>> +static void __wait_dma_qos_del(struct work_struct *work)
>>> +{
>>> +     struct wait_dma_qos *qos = container_of(work, typeof(*qos), del);
>>> +
>>> +     if (!cancel_work_sync(&qos->add))
>>> +             pm_qos_remove_request(&qos->req);
>>> +
>>> +     kfree(qos);
>>> +}
>>> +
>>> +static struct wait_dma_qos *wait_dma_qos_add(void)
>>> +{
>>> +     struct wait_dma_qos *qos;
>>> +
>>> +     /* Called under TASK_INTERRUPTIBLE, so not allowed to sleep/block. */
>>> +     qos = kzalloc(sizeof(*qos), GFP_NOWAIT | __GFP_NOWARN);
>>
>> Is it too big to put on the stack in i915_request_wait? Looks like that
>> would be simpler.
> 
> We don't want to be synchronous in our out path, as that directly adds
> to the client latency.

True, I missed the fact that we wouldn't be able to return until we've
synced with the worker.

>>> +     if (!qos)
>>> +             return NULL;
>>> +
>>> +     INIT_WORK(&qos->add, __wait_dma_qos_add);
>>> +     INIT_WORK(&qos->del, __wait_dma_qos_del);
>>> +     schedule_work_on(raw_smp_processor_id(), &qos->add);
>>
>> Do we want to use the highpri wq? But in any case we do have a worker
>> latency here, which may completely defeat the 50us QoS request. :(
>>
>> Also, do you need to specify the CPU manually, or is that in fact
>> detrimental to the worker running ASAP? AFAIU this makes the worker
>> only able to start once we go to sleep, with potentially other work
>> items queued ahead of ours.
> 
> That was the idea with pinning it to the current CPU: the worker is
> only run when we schedule ourselves out. If we skipped the schedule, we
> wouldn't need to adjust the pm_qos. There is still some wiggle with
> preemption, but I don't see that being a huge concern; if we are
> switching CPUs, we may as well make sure we don't enter a high C-state.

I guess it is OK: if there are preceding wq items which delay our
PM QoS setup, they just add latency which we would get anyway between
sleeping and being woken up. If I understand how these things work...

But the highpri wq would still be a good move, I think.

> 
>>> +
>>> +     return qos;
>>> +}
>>> +
>>> +static void wait_dma_qos_del(struct wait_dma_qos *qos)
>>> +{
>>> +     /* Defer to a worker so as not to incur extra latency for our woken client. */
>>> +     if (qos)
>>> +             schedule_work(&qos->del);
>>> +}
>>> +
>>>    /**
>>>     * i915_request_wait - wait until execution of request has finished
>>>     * @rq: the request to wait upon
>>> @@ -1286,6 +1331,7 @@ long i915_request_wait(struct i915_request *rq,
>>>        wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
>>>        DEFINE_WAIT_FUNC(reset, default_wake_function);
>>>        DEFINE_WAIT_FUNC(exec, default_wake_function);
>>> +     struct wait_dma_qos *qos = NULL;
>>>        struct intel_wait wait;
>>>    
>>>        might_sleep();
>>> @@ -1363,6 +1409,11 @@ long i915_request_wait(struct i915_request *rq,
>>>                        break;
>>>                }
>>>    
>>> +             if (!qos &&
>>> +                 i915_seqno_passed(intel_engine_get_seqno(rq->engine),
>>> +                                   wait.seqno - 1))
>>> +                     qos = wait_dma_qos_add();
>>> +
>>>                timeout = io_schedule_timeout(timeout);
>>>    
>>>                if (intel_wait_complete(&wait) &&
>>> @@ -1412,6 +1463,7 @@ long i915_request_wait(struct i915_request *rq,
>>>        if (flags & I915_WAIT_LOCKED)
>>>                remove_wait_queue(errq, &reset);
>>>        remove_wait_queue(&rq->execute, &exec);
>>> +     wait_dma_qos_del(qos);
>>>        trace_i915_request_wait_end(rq);
>>>    
>>>        return timeout;
>>>
>>
>> Another thing we talked about on IRC is the potential to introduce an
>> explicit low-latency flag to the gem_wait ioctl. That would punt the
>> responsibility to userspace to know whether it cares, but on the other
>> hand, if some benchmarks benefit from the implicit setting, that could
>> be tempting. You said media-bench likes it so I'll try it.
>>
>> An explicit request would also simplify the code by removing the need
>> for workers, and remove the worker latency from the request, which may
>> be the most attractive benefit.
> 
> No, we still need the workers, as the pm_qos_update can take a while if
> it decides to change the C-state there and then, so we definitely don't
> want that in the tail, and we might as well keep the trick of only
> doing the pm_qos_update if we sleep.

Okay, true, but we would be able to set up the PM QoS earlier, before we
do our own busy spin, and if on a different CPU that would mean partially
avoiding the worker latency. Which would be nice, since if we are making
some attempt at wake-up guarantees it would be good to get as close to
them as possible.

In the meantime I have exercised this via media-bench and it did seem
there are gains for a few workloads (and almost no reliable
regressions). But when run under tracing the gains disappeared, which
makes me think either the 2s workload duration I selected is too short
to give reliable numbers, or tracing brings a similar benefit without
the PM QoS patch. The only conclusion is that more thorough testing is
needed.

Regards,

Tvrtko

>> We could also apply the QoS request before the seqno assignment.
>> Currently I think there is a small window where the wait can race with
>> it and fall into a high-latency sleep, even if it would later choose to
>> request low-latency.
> 
> That window should be followed by an interrupt, I think, and we should
> have applied the irq_seqno barrier for gen5-gen7, so the seqno should be
> valid wrt the previous interrupt.
> 
> A hard problem to debug for sure. Maybe we should preserve the
> last-interrupt timestamp in the HWSP and then we can inspect that.
> Ideally though we want a journal so we can go back in case the next
> interrupt arrives before we wake up, making the latency seem smaller.
> Still, some information (with detectable error) is better than none.
> Let's see what that looks like.
> -Chris
>
Chris Wilson Aug. 3, 2018, 1:57 p.m. UTC | #5
Quoting Tvrtko Ursulin (2018-08-03 14:00:33)
> In the meantime I have exercised this via media-bench and it did seem
> there are gains for a few workloads (and almost no reliable
> regressions). But when run under tracing the gains disappeared, which
> makes me think either the 2s workload duration I selected is too short
> to give reliable numbers, or tracing brings a similar benefit without
> the PM QoS patch. The only conclusion is that more thorough testing is
> needed.

Hmm, right, tracing really does a number on cpufreq.

Switched to 

+       if (IS_ENABLED(CONFIG_DRM_I915_TRACE_IRQ_LATENCY)) {
+               struct drm_i915_private *dev_priv = rq->i915;
+               u32 now = I915_READ(RING_TIMESTAMP(rq->engine->mmio_base));
+
+               trace_i915_wait_irq_latency(rq,
+                                           now - intel_engine_get_timestamp(rq->engine),
+                                           rq->global_seqno == intel_engine_get_seqno(rq->engine));
+       }
+

from a plain pr_err() and the latency effect just vanished. That makes
it tricky.

        gem_sync   324 [002]   100.695373: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6882, prio=6882, delay=231
        gem_sync   324 [002]   100.695536: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6883, prio=6883, delay=219
        gem_sync   324 [002]   100.695697: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6884, prio=6884, delay=215
        gem_sync   324 [002]   100.695860: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6885, prio=6885, delay=231
        gem_sync   324 [002]   100.696040: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6886, prio=6886, delay=313
        gem_sync   324 [002]   100.696203: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6887, prio=6887, delay=218
        gem_sync   324 [002]   100.696366: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6888, prio=6888, delay=216
        gem_sync   324 [002]   100.696527: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6889, prio=6889, delay=214
        gem_sync   324 [002]   100.696689: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6890, prio=6890, delay=218
        gem_sync   324 [002]   100.696851: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6891, prio=6891, delay=215
        gem_sync   324 [002]   100.697013: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6892, prio=6892, delay=214
        gem_sync   324 [002]   100.697175: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6893, prio=6893, delay=216
        gem_sync   324 [002]   100.697336: i915:i915_wait_irq_latency: dev=0, engine=0:0, hw_id=2, ctx=20, seqno=6894, prio=6894, delay=217

from

[   14.232750] Wake up latency: 4004 cycles
[   14.233779] Wake up latency: 3236 cycles
[   14.234831] Wake up latency: 3580 cycles
[   14.235864] Wake up latency: 3220 cycles
[   14.236865] Wake up latency: 2653 cycles
[   14.237893] Wake up latency: 3209 cycles
[   14.238941] Wake up latency: 3492 cycles
[   14.239975] Wake up latency: 3227 cycles
[   14.240943] Wake up latency: 2019 cycles
[   14.241971] Wake up latency: 3199 cycles
[   14.243019] Wake up latency: 3500 cycles
[   14.244051] Wake up latency: 3198 cycles
[   14.245073] Wake up latency: 3014 cycles
[   14.246108] Wake up latency: 3238 cycles
[   14.247158] Wake up latency: 3539 cycles
[   14.248188] Wake up latency: 3200 cycles

-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5c2c93cbab12..f3ff8dbe363d 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1258,6 +1258,51 @@  static bool __i915_wait_request_check_and_reset(struct i915_request *request)
 	return true;
 }
 
+struct wait_dma_qos {
+	struct pm_qos_request req;
+	struct work_struct add, del;
+};
+
+static void __wait_dma_qos_add(struct work_struct *work)
+{
+	struct wait_dma_qos *qos = container_of(work, typeof(*qos), add);
+
+	pm_qos_add_request(&qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
+}
+
+static void __wait_dma_qos_del(struct work_struct *work)
+{
+	struct wait_dma_qos *qos = container_of(work, typeof(*qos), del);
+
+	if (!cancel_work_sync(&qos->add))
+		pm_qos_remove_request(&qos->req);
+
+	kfree(qos);
+}
+
+static struct wait_dma_qos *wait_dma_qos_add(void)
+{
+	struct wait_dma_qos *qos;
+
+	/* Called under TASK_INTERRUPTIBLE, so not allowed to sleep/block. */
+	qos = kzalloc(sizeof(*qos), GFP_NOWAIT | __GFP_NOWARN);
+	if (!qos)
+		return NULL;
+
+	INIT_WORK(&qos->add, __wait_dma_qos_add);
+	INIT_WORK(&qos->del, __wait_dma_qos_del);
+	schedule_work_on(raw_smp_processor_id(), &qos->add);
+
+	return qos;
+}
+
+static void wait_dma_qos_del(struct wait_dma_qos *qos)
+{
+	/* Defer to a worker so as not to incur extra latency for our woken client. */
+	if (qos)
+		schedule_work(&qos->del);
+}
+
 /**
  * i915_request_wait - wait until execution of request has finished
  * @rq: the request to wait upon
@@ -1286,6 +1331,7 @@  long i915_request_wait(struct i915_request *rq,
 	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
 	DEFINE_WAIT_FUNC(reset, default_wake_function);
 	DEFINE_WAIT_FUNC(exec, default_wake_function);
+	struct wait_dma_qos *qos = NULL;
 	struct intel_wait wait;
 
 	might_sleep();
@@ -1363,6 +1409,11 @@  long i915_request_wait(struct i915_request *rq,
 			break;
 		}
 
+		if (!qos &&
+		    i915_seqno_passed(intel_engine_get_seqno(rq->engine),
+				      wait.seqno - 1))
+			qos = wait_dma_qos_add();
+
 		timeout = io_schedule_timeout(timeout);
 
 		if (intel_wait_complete(&wait) &&
@@ -1412,6 +1463,7 @@  long i915_request_wait(struct i915_request *rq,
 	if (flags & I915_WAIT_LOCKED)
 		remove_wait_queue(errq, &reset);
 	remove_wait_queue(&rq->execute, &exec);
+	wait_dma_qos_del(qos);
 	trace_i915_request_wait_end(rq);
 
 	return timeout;
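
One subtlety in the teardown deserves a note: because the add is itself
deferred to a worker, the del worker can run before the add ever has.
The return value of cancel_work_sync() distinguishes the two cases,
which is what keeps the add/remove pair balanced. The same function
again, with explanatory comments added for illustration:

static void __wait_dma_qos_del(struct work_struct *work)
{
	struct wait_dma_qos *qos = container_of(work, typeof(*qos), del);

	/*
	 * cancel_work_sync() returns true if the add work was still
	 * pending and has now been cancelled: in that case the pm_qos
	 * request was never registered, so there is nothing to remove.
	 * If it returns false, the add has already run (or was running
	 * and has been waited on to finish), so the request exists and
	 * must be removed before the structure is freed.
	 */
	if (!cancel_work_sync(&qos->add))
		pm_qos_remove_request(&qos->req);

	kfree(qos);
}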