Message ID | 20180625094842.8499-12-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Chris Wilson <chris@chris-wilson.co.uk> writes: > By taking advantage of the RCU protection of the task struct, we can find > the appropriate signaler under the spinlock and then release the spinlock > before waking the task and signaling the fence. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++----------- > 1 file changed, 22 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index 316d0b08d40f..53dad48f92ce 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -1145,21 +1145,23 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv) > > static void notify_ring(struct intel_engine_cs *engine) > { > + const u32 seqno = intel_engine_get_seqno(engine); > struct i915_request *rq = NULL; > + struct task_struct *tsk = NULL; > struct intel_wait *wait; > > - if (!engine->breadcrumbs.irq_armed) > + if (unlikely(!engine->breadcrumbs.irq_armed)) > return; > Ok, so due to unlikeliness, you get the seqno early. > atomic_inc(&engine->irq_count); > - set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted); > + > + rcu_read_lock(); As I understand from irc discussion, we have our own const or stable copy of task struct from now on. > > spin_lock(&engine->breadcrumbs.irq_lock); > wait = engine->breadcrumbs.irq_wait; > if (wait) { > - bool wakeup = engine->irq_seqno_barrier; > - > - /* We use a callback from the dma-fence to submit > + /* > + * We use a callback from the dma-fence to submit > * requests after waiting on our own requests. To > * ensure minimum delay in queuing the next request to > * hardware, signal the fence now rather than wait for > @@ -1170,19 +1172,23 @@ static void notify_ring(struct intel_engine_cs *engine) > * and to handle coalescing of multiple seqno updates > * and many waiters. > */ > - if (i915_seqno_passed(intel_engine_get_seqno(engine), > - wait->seqno)) { > + if (i915_seqno_passed(seqno, wait->seqno)) { > struct i915_request *waiter = wait->request; > > - wakeup = true; > if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, > &waiter->fence.flags) && > intel_wait_check_request(wait, waiter)) > rq = i915_request_get(waiter); > - } > > - if (wakeup) > - wake_up_process(wait->tsk); > + tsk = wait->tsk; > + } else { > + if (engine->irq_seqno_barrier && > + i915_seqno_passed(seqno, wait->seqno - 1)) { > + set_bit(ENGINE_IRQ_BREADCRUMB, > + &engine->irq_posted); > + tsk = wait->tsk; Hmm, you are optimistic that the latency of wakeup will be on par or greater than the next request completion? And wait side notices too that we are close and spins, instead of going back to sleep? > + } > + } > } else { > if (engine->breadcrumbs.irq_armed) > __intel_engine_disarm_breadcrumbs(engine); > @@ -1195,6 +1201,11 @@ static void notify_ring(struct intel_engine_cs *engine) > i915_request_put(rq); > } > > + if (tsk && tsk->state & TASK_NORMAL) > + wake_up_process(tsk); > + Why the TASK_NORMAL check? -Mika > + rcu_read_unlock(); > + > trace_intel_engine_notify(engine, wait); > } > > -- > 2.18.0
Quoting Mika Kuoppala (2018-06-27 14:08:34) > Chris Wilson <chris@chris-wilson.co.uk> writes: > > > By taking advantage of the RCU protection of the task struct, we can find > > the appropriate signaler under the spinlock and then release the spinlock > > before waking the task and signaling the fence. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > --- > > drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++----------- > > 1 file changed, 22 insertions(+), 11 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > > index 316d0b08d40f..53dad48f92ce 100644 > > --- a/drivers/gpu/drm/i915/i915_irq.c > > +++ b/drivers/gpu/drm/i915/i915_irq.c > > @@ -1145,21 +1145,23 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv) > > > > static void notify_ring(struct intel_engine_cs *engine) > > { > > + const u32 seqno = intel_engine_get_seqno(engine); > > struct i915_request *rq = NULL; > > + struct task_struct *tsk = NULL; > > struct intel_wait *wait; > > > > - if (!engine->breadcrumbs.irq_armed) > > + if (unlikely(!engine->breadcrumbs.irq_armed)) > > return; > > > > Ok, so due to unlikeliness, you get the seqno early. > > > atomic_inc(&engine->irq_count); > > - set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted); > > + > > + rcu_read_lock(); > > As I understand from irc discussion, we have our own const > or stable copy of task struct from now on. > > > > spin_lock(&engine->breadcrumbs.irq_lock); > > wait = engine->breadcrumbs.irq_wait; > > if (wait) { > > - bool wakeup = engine->irq_seqno_barrier; > > - > > - /* We use a callback from the dma-fence to submit > > + /* > > + * We use a callback from the dma-fence to submit > > * requests after waiting on our own requests. To > > * ensure minimum delay in queuing the next request to > > * hardware, signal the fence now rather than wait for > > @@ -1170,19 +1172,23 @@ static void notify_ring(struct intel_engine_cs *engine) > > * and to handle coalescing of multiple seqno updates > > * and many waiters. > > */ > > - if (i915_seqno_passed(intel_engine_get_seqno(engine), > > - wait->seqno)) { > > + if (i915_seqno_passed(seqno, wait->seqno)) { > > struct i915_request *waiter = wait->request; > > > > - wakeup = true; > > if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, > > &waiter->fence.flags) && > > intel_wait_check_request(wait, waiter)) > > rq = i915_request_get(waiter); > > - } > > > > - if (wakeup) > > - wake_up_process(wait->tsk); > > + tsk = wait->tsk; > > + } else { > > + if (engine->irq_seqno_barrier && > > + i915_seqno_passed(seqno, wait->seqno - 1)) { > > + set_bit(ENGINE_IRQ_BREADCRUMB, > > + &engine->irq_posted); > > + tsk = wait->tsk; > > Hmm, you are optimistic that the latency of wakeup will be on par > or greater than the next request completion? > > And wait side notices too that we are close and spins, > instead of going back to sleep? Don't forget this is the missed breadcrumb mitigation for gen5-gen7, and only for it. The intent is to keep that away from the paths that do not need the extra delays. > > + } > > + } > > } else { > > if (engine->breadcrumbs.irq_armed) > > __intel_engine_disarm_breadcrumbs(engine); > > @@ -1195,6 +1201,11 @@ static void notify_ring(struct intel_engine_cs *engine) > > i915_request_put(rq); > > } > > > > + if (tsk && tsk->state & TASK_NORMAL) > > + wake_up_process(tsk); > > + > > Why the TASK_NORMAL check? *shrug* too long spent watching the signaler on gen6, i.e. I grown accustomed to expecting the tsk to be running at this point. -Chris
Chris Wilson <chris@chris-wilson.co.uk> writes: > Quoting Mika Kuoppala (2018-06-27 14:08:34) >> Chris Wilson <chris@chris-wilson.co.uk> writes: >> >> > By taking advantage of the RCU protection of the task struct, we can find >> > the appropriate signaler under the spinlock and then release the spinlock >> > before waking the task and signaling the fence. >> > >> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> >> > --- >> > drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++----------- >> > 1 file changed, 22 insertions(+), 11 deletions(-) >> > >> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c >> > index 316d0b08d40f..53dad48f92ce 100644 >> > --- a/drivers/gpu/drm/i915/i915_irq.c >> > +++ b/drivers/gpu/drm/i915/i915_irq.c >> > @@ -1145,21 +1145,23 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv) >> > >> > static void notify_ring(struct intel_engine_cs *engine) >> > { >> > + const u32 seqno = intel_engine_get_seqno(engine); >> > struct i915_request *rq = NULL; >> > + struct task_struct *tsk = NULL; >> > struct intel_wait *wait; >> > >> > - if (!engine->breadcrumbs.irq_armed) >> > + if (unlikely(!engine->breadcrumbs.irq_armed)) >> > return; >> > >> >> Ok, so due to unlikeliness, you get the seqno early. >> >> > atomic_inc(&engine->irq_count); >> > - set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted); >> > + >> > + rcu_read_lock(); >> >> As I understand from irc discussion, we have our own const >> or stable copy of task struct from now on. >> > >> > spin_lock(&engine->breadcrumbs.irq_lock); >> > wait = engine->breadcrumbs.irq_wait; >> > if (wait) { >> > - bool wakeup = engine->irq_seqno_barrier; >> > - >> > - /* We use a callback from the dma-fence to submit >> > + /* >> > + * We use a callback from the dma-fence to submit >> > * requests after waiting on our own requests. To >> > * ensure minimum delay in queuing the next request to >> > * hardware, signal the fence now rather than wait for >> > @@ -1170,19 +1172,23 @@ static void notify_ring(struct intel_engine_cs *engine) >> > * and to handle coalescing of multiple seqno updates >> > * and many waiters. >> > */ >> > - if (i915_seqno_passed(intel_engine_get_seqno(engine), >> > - wait->seqno)) { >> > + if (i915_seqno_passed(seqno, wait->seqno)) { >> > struct i915_request *waiter = wait->request; >> > >> > - wakeup = true; >> > if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, >> > &waiter->fence.flags) && >> > intel_wait_check_request(wait, waiter)) >> > rq = i915_request_get(waiter); >> > - } >> > >> > - if (wakeup) >> > - wake_up_process(wait->tsk); >> > + tsk = wait->tsk; >> > + } else { >> > + if (engine->irq_seqno_barrier && >> > + i915_seqno_passed(seqno, wait->seqno - 1)) { >> > + set_bit(ENGINE_IRQ_BREADCRUMB, >> > + &engine->irq_posted); >> > + tsk = wait->tsk; >> >> Hmm, you are optimistic that the latency of wakeup will be on par >> or greater than the next request completion? >> >> And wait side notices too that we are close and spins, >> instead of going back to sleep? > > Don't forget this is the missed breadcrumb mitigation for gen5-gen7, and > only for it. The intent is to keep that away from the paths that do not > need the extra delays. > Looks sensible optimization. But I would strip it out from this patch to it's own, to keep the content in sync with the commit message. -Mika
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 316d0b08d40f..53dad48f92ce 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1145,21 +1145,23 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv) static void notify_ring(struct intel_engine_cs *engine) { + const u32 seqno = intel_engine_get_seqno(engine); struct i915_request *rq = NULL; + struct task_struct *tsk = NULL; struct intel_wait *wait; - if (!engine->breadcrumbs.irq_armed) + if (unlikely(!engine->breadcrumbs.irq_armed)) return; atomic_inc(&engine->irq_count); - set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted); + + rcu_read_lock(); spin_lock(&engine->breadcrumbs.irq_lock); wait = engine->breadcrumbs.irq_wait; if (wait) { - bool wakeup = engine->irq_seqno_barrier; - - /* We use a callback from the dma-fence to submit + /* + * We use a callback from the dma-fence to submit * requests after waiting on our own requests. To * ensure minimum delay in queuing the next request to * hardware, signal the fence now rather than wait for @@ -1170,19 +1172,23 @@ static void notify_ring(struct intel_engine_cs *engine) * and to handle coalescing of multiple seqno updates * and many waiters. */ - if (i915_seqno_passed(intel_engine_get_seqno(engine), - wait->seqno)) { + if (i915_seqno_passed(seqno, wait->seqno)) { struct i915_request *waiter = wait->request; - wakeup = true; if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &waiter->fence.flags) && intel_wait_check_request(wait, waiter)) rq = i915_request_get(waiter); - } - if (wakeup) - wake_up_process(wait->tsk); + tsk = wait->tsk; + } else { + if (engine->irq_seqno_barrier && + i915_seqno_passed(seqno, wait->seqno - 1)) { + set_bit(ENGINE_IRQ_BREADCRUMB, + &engine->irq_posted); + tsk = wait->tsk; + } + } } else { if (engine->breadcrumbs.irq_armed) __intel_engine_disarm_breadcrumbs(engine); @@ -1195,6 +1201,11 @@ static void notify_ring(struct intel_engine_cs *engine) i915_request_put(rq); } + if (tsk && tsk->state & TASK_NORMAL) + wake_up_process(tsk); + + rcu_read_unlock(); + trace_intel_engine_notify(engine, wait); }
By taking advantage of the RCU protection of the task struct, we can find the appropriate signaler under the spinlock and then release the spinlock before waking the task and signaling the fence. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-)