
[v3] drm/i915: Limit C-states when waiting for the active request

Message ID 20180730123954.6249-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series: [v3] drm/i915: Limit C-states when waiting for the active request

Commit Message

Chris Wilson July 30, 2018, 12:39 p.m. UTC
If we are waiting for the currently executing request, we have a good
idea that it will be completed in the very near future and so want to
cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.

v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE

Testcase: igt/gem_sync/store-default
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Francisco Jerez <currojerez@riseup.net>
---
 drivers/gpu/drm/i915/i915_request.c | 38 +++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
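
The v2/v3 notes come down to one rule: once the waiter has set
TASK_INTERRUPTIBLE, nothing on the wait path may block, neither a
sleeping allocation nor the blocking notifier chain that
pm_qos_add_request() runs. A minimal illustration of the hazard (not
part of the patch; the 50us value matches the one used below):

	set_current_state(TASK_INTERRUPTIBLE);

	/* v2 hazard: GFP_KERNEL may sleep, disturbing the task state
	 * we just set, so the wait path must allocate atomically. */
	pm_qos = kzalloc(sizeof(*pm_qos), GFP_NOWAIT | __GFP_NOWARN);

	/* v3 hazard: pm_qos_add_request() fires a blocking notifier
	 * chain and may itself sleep, so the patch defers it (and the
	 * matching removal) to workers instead of calling it here. */
	if (pm_qos)
		schedule_work_on(smp_processor_id(), &pm_qos->add);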

Comments

Mika Kuoppala July 30, 2018, 1:07 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> If we are waiting for the currently executing request, we have a good
> idea that it will be completed in the very near future and so want to
> cap the CPU_DMA_LATENCY to ensure that we wake up the client quickly.
>
> v2: Not allowed to block in kmalloc after setting TASK_INTERRUPTIBLE.
> v3: Avoid the blocking notifier as well for TASK_INTERRUPTIBLE
>
> Testcase: igt/gem_sync/store-default
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Francisco Jerez <currojerez@riseup.net>
> ---
>  drivers/gpu/drm/i915/i915_request.c | 38 +++++++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 5c2c93cbab12..7c7746ef0d1b 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1258,6 +1258,28 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
>  	return true;
>  }
>  
> +struct pm_qos {
> +	struct pm_qos_request req;
> +	struct work_struct add, del;
> +};
> +
> +static void pm_qos_add(struct work_struct *work)
> +{
> +	struct pm_qos *pm_qos = container_of(work, typeof(*pm_qos), add);
> +
> +	pm_qos_add_request(&pm_qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
> +}
> +
> +static void pm_qos_del(struct work_struct *work)
> +{
> +	struct pm_qos *pm_qos = container_of(work, typeof(*pm_qos), del);
> +
> +	if (!cancel_work_sync(&pm_qos->add))
> +		pm_qos_remove_request(&pm_qos->req);
> +
> +	kfree(pm_qos);
> +}
> +
>  /**
>   * i915_request_wait - wait until execution of request has finished
>   * @rq: the request to wait upon
> @@ -1286,6 +1308,7 @@ long i915_request_wait(struct i915_request *rq,
>  	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
>  	DEFINE_WAIT_FUNC(reset, default_wake_function);
>  	DEFINE_WAIT_FUNC(exec, default_wake_function);
> +	struct pm_qos *pm_qos = NULL;
>  	struct intel_wait wait;
>  
>  	might_sleep();
> @@ -1363,6 +1386,19 @@ long i915_request_wait(struct i915_request *rq,
>  			break;
>  		}
>  
> +		if (!pm_qos &&
> +		    i915_seqno_passed(intel_engine_get_seqno(rq->engine),
> +				      wait.seqno - 1)) {
> +			pm_qos = kzalloc(sizeof(*pm_qos),
> +					 GFP_NOWAIT | __GFP_NOWARN);
> +			if (pm_qos) {
> +				INIT_WORK(&pm_qos->add, pm_qos_add);
> +				INIT_WORK(&pm_qos->del, pm_qos_del);
> +				schedule_work_on(smp_processor_id(),
> +						 &pm_qos->add);
> +			}

My immediate thought is: why this, rather than preallocating the pm_qos
on init/fini and just adding/removing requests at suitable spots?

-Mika
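
A rough sketch of the preallocation alternative being suggested here,
purely illustrative (engine->qos is a hypothetical field; note that
pm_qos_update_request() still runs the blocking notifier chain, which
is the objection raised in the reply below):

	/* At engine init: register one request that normally asks for
	 * nothing (PM_QOS_DEFAULT_VALUE). */
	pm_qos_add_request(&engine->qos, PM_QOS_CPU_DMA_LATENCY,
			   PM_QOS_DEFAULT_VALUE);

	/* In the wait path: tighten and later relax the existing
	 * request instead of allocating a new one. May still sleep. */
	pm_qos_update_request(&engine->qos, 50);
	...
	pm_qos_update_request(&engine->qos, PM_QOS_DEFAULT_VALUE);

	/* At engine fini. */
	pm_qos_remove_request(&engine->qos);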


Chris Wilson July 30, 2018, 1:27 p.m. UTC | #2
Quoting Mika Kuoppala (2018-07-30 14:07:07)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> [... full patch quote snipped; see the diff above ...]
> 
> My immediate thought is: why this, rather than preallocating the pm_qos
> on init/fini and just adding/removing requests at suitable spots?

For simplicity, we want multiple requests. Now we could go the global
route, to save on the kmalloc/kfree; I wasn't sure if that was worth it.
The principal cost is the kmalloc from the per-cpu slab, which we only
need before sleeping, after having already checked with a short spin
that the request isn't likely to complete immediately. We would still
need to use workers to avoid the blocking notifier, especially for
client wakeup.

The way it is structured should mean that only one waiter at a time on
each engine needs to allocate, so it shouldn't be frequent enough to
worry about. (Certainly gem_sync --run store-default isn't too alarming:
that's the test case that goes from 900us to 170us on bxt, hits the
kmalloc on every iteration, and shows no more latency than if we purely
busy-waited.)

The only critical factor then is readability. Is it terrible? Maybe a
small helper, pm_qos = pm_qos_create(), and a better name than pm_qos so
that we associate it with the wait: struct wait_dma_qos?
-Chris
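
A hypothetical shape for that suggested helper (names illustrative, not
committed code; the add/del workers are the patch's pm_qos_add() and
pm_qos_del() adapted to the renamed struct):

	struct wait_dma_qos {
		struct pm_qos_request req;
		struct work_struct add, del;
	};

	static struct wait_dma_qos *wait_dma_qos_create(void)
	{
		struct wait_dma_qos *qos;

		/* Atomic: the caller may already be TASK_INTERRUPTIBLE. */
		qos = kzalloc(sizeof(*qos), GFP_NOWAIT | __GFP_NOWARN);
		if (!qos)
			return NULL;

		INIT_WORK(&qos->add, wait_dma_qos_add); /* pm_qos_add() above */
		INIT_WORK(&qos->del, wait_dma_qos_del); /* pm_qos_del() above */

		/* Defer the blocking notifier to a worker on this CPU. */
		schedule_work_on(smp_processor_id(), &qos->add);
		return qos;
	}

	static void wait_dma_qos_destroy(struct wait_dma_qos *qos)
	{
		if (qos)
			schedule_work(&qos->del); /* the worker frees qos */
	}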

Patch

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5c2c93cbab12..7c7746ef0d1b 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1258,6 +1258,28 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
 	return true;
 }
 
+struct pm_qos {
+	struct pm_qos_request req;
+	struct work_struct add, del;
+};
+
+static void pm_qos_add(struct work_struct *work)
+{
+	struct pm_qos *pm_qos = container_of(work, typeof(*pm_qos), add);
+
+	pm_qos_add_request(&pm_qos->req, PM_QOS_CPU_DMA_LATENCY, 50);
+}
+
+static void pm_qos_del(struct work_struct *work)
+{
+	struct pm_qos *pm_qos = container_of(work, typeof(*pm_qos), del);
+
+	if (!cancel_work_sync(&pm_qos->add))
+		pm_qos_remove_request(&pm_qos->req);
+
+	kfree(pm_qos);
+}
+
 /**
  * i915_request_wait - wait until execution of request has finished
  * @rq: the request to wait upon
@@ -1286,6 +1308,7 @@ long i915_request_wait(struct i915_request *rq,
 	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
 	DEFINE_WAIT_FUNC(reset, default_wake_function);
 	DEFINE_WAIT_FUNC(exec, default_wake_function);
+	struct pm_qos *pm_qos = NULL;
 	struct intel_wait wait;
 
 	might_sleep();
@@ -1363,6 +1386,19 @@ long i915_request_wait(struct i915_request *rq,
 			break;
 		}
 
+		if (!pm_qos &&
+		    i915_seqno_passed(intel_engine_get_seqno(rq->engine),
+				      wait.seqno - 1)) {
+			pm_qos = kzalloc(sizeof(*pm_qos),
+					 GFP_NOWAIT | __GFP_NOWARN);
+			if (pm_qos) {
+				INIT_WORK(&pm_qos->add, pm_qos_add);
+				INIT_WORK(&pm_qos->del, pm_qos_del);
+				schedule_work_on(smp_processor_id(),
+						 &pm_qos->add);
+			}
+		}
+
 		timeout = io_schedule_timeout(timeout);
 
 		if (intel_wait_complete(&wait) &&
@@ -1412,6 +1448,8 @@ long i915_request_wait(struct i915_request *rq,
 	if (flags & I915_WAIT_LOCKED)
 		remove_wait_queue(errq, &reset);
 	remove_wait_queue(&rq->execute, &exec);
+	if (pm_qos)
+		schedule_work(&pm_qos->del);
 	trace_i915_request_wait_end(rq);
 
 	return timeout;