Message ID | 20200730093756.16737-17-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/21] drm/i915: Add a couple of missing i915_active_fini() | expand |
On 30/07/2020 10:37, Chris Wilson wrote: > Make b->signaled_requests a lockless-list so that we can manipulate it > outside of the b->irq_lock. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 30 +++++++++++-------- > .../gpu/drm/i915/gt/intel_breadcrumbs_types.h | 2 +- > drivers/gpu/drm/i915/i915_request.h | 6 +++- > 3 files changed, 23 insertions(+), 15 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c > index fc6f0223d2c8..6a278bf0fc6b 100644 > --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c > +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c > @@ -174,16 +174,13 @@ static void add_retire(struct intel_breadcrumbs *b, struct intel_timeline *tl) > intel_engine_add_retire(b->irq_engine, tl); > } > > -static bool __signal_request(struct i915_request *rq, struct list_head *signals) > +static bool __signal_request(struct i915_request *rq) > { > - clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); > - > if (!__dma_fence_signal(&rq->fence)) { > i915_request_put(rq); > return false; > } > > - list_add_tail(&rq->signal_link, signals); > return true; > } > > @@ -191,17 +188,19 @@ static void signal_irq_work(struct irq_work *work) > { > struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work); > const ktime_t timestamp = ktime_get(); > + struct llist_node *signal, *sn; > struct intel_context *ce, *cn; > struct list_head *pos, *next; > - LIST_HEAD(signal); > + > + signal = NULL; > + if (unlikely(!llist_empty(&b->signaled_requests))) > + signal = llist_del_all(&b->signaled_requests); > > spin_lock(&b->irq_lock); > > - if (list_empty(&b->signalers)) > + if (!signal && list_empty(&b->signalers)) The only open from previous round was on this change. If I understood your previous reply correctly, checking this or not simply controls the disarm point and is not related to this patch. 
With the check added now we would disarm later, because even already signaled requests would keep it armed. I would prefer this was a separate patch if you could possibly be convinced. Regards, Tvrtko > __intel_breadcrumbs_disarm_irq(b); > > - list_splice_init(&b->signaled_requests, &signal); > - > list_for_each_entry_safe(ce, cn, &b->signalers, signal_link) { > GEM_BUG_ON(list_empty(&ce->signals)); > > @@ -218,7 +217,11 @@ static void signal_irq_work(struct irq_work *work) > * spinlock as the callback chain may end up adding > * more signalers to the same context or engine. > */ > - __signal_request(rq, &signal); > + clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); > + if (__signal_request(rq)) { > + rq->signal_node.next = signal; > + signal = &rq->signal_node; > + } > } > > /* > @@ -238,9 +241,9 @@ static void signal_irq_work(struct irq_work *work) > > spin_unlock(&b->irq_lock); > > - list_for_each_safe(pos, next, &signal) { > + llist_for_each_safe(signal, sn, signal) { > struct i915_request *rq = > - list_entry(pos, typeof(*rq), signal_link); > + llist_entry(signal, typeof(*rq), signal_node); > struct list_head cb_list; > > spin_lock(&rq->lock); > @@ -264,7 +267,7 @@ intel_breadcrumbs_create(struct intel_engine_cs *irq_engine) > > spin_lock_init(&b->irq_lock); > INIT_LIST_HEAD(&b->signalers); > - INIT_LIST_HEAD(&b->signaled_requests); > + init_llist_head(&b->signaled_requests); > > init_irq_work(&b->irq_work, signal_irq_work); > > @@ -327,7 +330,8 @@ static void insert_breadcrumb(struct i915_request *rq, > * its signal completion. 
> */ > if (__request_completed(rq)) { > - if (__signal_request(rq, &b->signaled_requests)) > + if (__signal_request(rq) && > + llist_add(&rq->signal_node, &b->signaled_requests)) > irq_work_queue(&b->irq_work); > return; > } > diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h > index 8e53b9942695..3fa19820b37a 100644 > --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h > @@ -35,7 +35,7 @@ struct intel_breadcrumbs { > struct intel_engine_cs *irq_engine; > > struct list_head signalers; > - struct list_head signaled_requests; > + struct llist_head signaled_requests; > > struct irq_work irq_work; /* for use from inside irq_lock */ > > diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h > index 16b721080195..874af6db6103 100644 > --- a/drivers/gpu/drm/i915/i915_request.h > +++ b/drivers/gpu/drm/i915/i915_request.h > @@ -176,7 +176,11 @@ struct i915_request { > struct intel_context *context; > struct intel_ring *ring; > struct intel_timeline __rcu *timeline; > - struct list_head signal_link; > + > + union { > + struct list_head signal_link; > + struct llist_node signal_node; > + }; > > /* > * The rcu epoch of when this request was allocated. Used to judiciously >
Quoting Tvrtko Ursulin (2020-07-31 16:06:55) > > On 30/07/2020 10:37, Chris Wilson wrote: > > @@ -191,17 +188,19 @@ static void signal_irq_work(struct irq_work *work) > > { > > struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work); > > const ktime_t timestamp = ktime_get(); > > + struct llist_node *signal, *sn; > > struct intel_context *ce, *cn; > > struct list_head *pos, *next; > > - LIST_HEAD(signal); > > + > > + signal = NULL; > > + if (unlikely(!llist_empty(&b->signaled_requests))) > > + signal = llist_del_all(&b->signaled_requests); > > > > spin_lock(&b->irq_lock); > > > > - if (list_empty(&b->signalers)) > > + if (!signal && list_empty(&b->signalers)) > > The only open from previous round was on this change. If I understood > your previous reply correctly, checking this or not simply controls the > disarm point and is not related to this patch. With the check added now > we would disarm later, because even already signaled requests would keep > it armed. I would prefer this was a separate patch if you could possibly > be convinced. I considered that since we add to the lockless list and then queue the irq work, that is a path that is not driven by the interrupt and so causing an issue with the idea of the interrupt shadow. Having a simple test I thought was a positive side-effect to filter out the early irq_work. -Chris
Quoting Chris Wilson (2020-07-31 16:12:32) > Quoting Tvrtko Ursulin (2020-07-31 16:06:55) > > > > On 30/07/2020 10:37, Chris Wilson wrote: > > > @@ -191,17 +188,19 @@ static void signal_irq_work(struct irq_work *work) > > > { > > > struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work); > > > const ktime_t timestamp = ktime_get(); > > > + struct llist_node *signal, *sn; > > > struct intel_context *ce, *cn; > > > struct list_head *pos, *next; > > > - LIST_HEAD(signal); > > > + > > > + signal = NULL; > > > + if (unlikely(!llist_empty(&b->signaled_requests))) > > > + signal = llist_del_all(&b->signaled_requests); > > > > > > spin_lock(&b->irq_lock); > > > > > > - if (list_empty(&b->signalers)) > > > + if (!signal && list_empty(&b->signalers)) > > > > The only open from previous round was on this change. If I understood > > your previous reply correctly, checking this or not simply controls the > > disarm point and is not related to this patch. With the check added now > > we would disarm later, because even already signaled requests would keep > > it armed. I would prefer this was a separate patch if you could possibly > > be convinced. > > I considered that since we add to the lockless list and then queue the > irq work, that is a path that is not driven by the interrupt and so > causing an issue with the idea of the interrupt shadow. Having a simple > test I thought was a positive side-effect to filter out the early > irq_work. How about a compromise and I sell the patch with a comment: /* * Keep the irq armed until the interrupt after all listeners are gone. * * Enabling/disabling the interrupt is rather costly, roughly a couple * of hundred microseconds. If we are proactive and enable/disable * the interrupt around every request that wants a breadcrumb, we * quickly drown in the extra orders of magnitude of latency imposed * on request submission. 
* * So we try to be lazy, and keep the interrupts enabled until no * more listeners appear within a breadcrumb interrupt interval (that * is until a request completes that no one cares about). The * observation is that listeners come in batches, and will often * listen to a bunch of requests in succession. * * We also try to avoid raising too many interrupts, as they may * be generated by userspace batches and it is unfortunately rather * too easy to drown the CPU under a flood of GPU interrupts. Thus * whenever no one appears to be listening, we turn off the interrupts. * Fewer interrupts should conserve power -- at the very least, fewer * interrupt draw less ire from other users of the system and tools * like powertop. */ -Chris
On 31/07/2020 16:12, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2020-07-31 16:06:55) >> >> On 30/07/2020 10:37, Chris Wilson wrote: >>> @@ -191,17 +188,19 @@ static void signal_irq_work(struct irq_work *work) >>> { >>> struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work); >>> const ktime_t timestamp = ktime_get(); >>> + struct llist_node *signal, *sn; >>> struct intel_context *ce, *cn; >>> struct list_head *pos, *next; >>> - LIST_HEAD(signal); >>> + >>> + signal = NULL; >>> + if (unlikely(!llist_empty(&b->signaled_requests))) >>> + signal = llist_del_all(&b->signaled_requests); >>> >>> spin_lock(&b->irq_lock); >>> >>> - if (list_empty(&b->signalers)) >>> + if (!signal && list_empty(&b->signalers)) >> >> The only open from previous round was on this change. If I understood >> your previous reply correctly, checking this or not simply controls the >> disarm point and is not related to this patch. With the check added now >> we would disarm later, because even already signaled requests would keep >> it armed. I would prefer this was a separate patch if you could possibly >> be convinced. > > I considered that since we add to the lockless list and then queue the > irq work, that is a path that is not driven by the interrupt and so > causing an issue with the idea of the interrupt shadow. Having a simple > test I thought was a positive side-effect to filter out the early > irq_work. I don't really follow. I look at it like this: No active signalers so we can disarm. What is the purpose of keeping the interrupt enabled if all that is on list are already completed requests? They will get signaled in the very same run of signal_irq_work so I don't see a connection with lockless list and keeping the interrupts on for longer. Regards, Tvrtko
On 31/07/2020 16:21, Chris Wilson wrote: > Quoting Chris Wilson (2020-07-31 16:12:32) >> Quoting Tvrtko Ursulin (2020-07-31 16:06:55) >>> >>> On 30/07/2020 10:37, Chris Wilson wrote: >>>> @@ -191,17 +188,19 @@ static void signal_irq_work(struct irq_work *work) >>>> { >>>> struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work); >>>> const ktime_t timestamp = ktime_get(); >>>> + struct llist_node *signal, *sn; >>>> struct intel_context *ce, *cn; >>>> struct list_head *pos, *next; >>>> - LIST_HEAD(signal); >>>> + >>>> + signal = NULL; >>>> + if (unlikely(!llist_empty(&b->signaled_requests))) >>>> + signal = llist_del_all(&b->signaled_requests); >>>> >>>> spin_lock(&b->irq_lock); >>>> >>>> - if (list_empty(&b->signalers)) >>>> + if (!signal && list_empty(&b->signalers)) >>> >>> The only open from previous round was on this change. If I understood >>> your previous reply correctly, checking this or not simply controls the >>> disarm point and is not related to this patch. With the check added now >>> we would disarm later, because even already signaled requests would keep >>> it armed. I would prefer this was a separate patch if you could possibly >>> be convinced. >> >> I considered that since we add to the lockless list and then queue the >> irq work, that is a path that is not driven by the interrupt and so >> causing an issue with the idea of the interrupt shadow. Having a simple >> test I thought was a positive side-effect to filter out the early >> irq_work. > > How about a compromise and I sell the patch with a comment: > /* > * Keep the irq armed until the interrupt after all listeners are gone. > * > * Enabling/disabling the interrupt is rather costly, roughly a couple > * of hundred microseconds. If we are proactive and enable/disable > * the interrupt around every request that wants a breadcrumb, we > * quickly drown in the extra orders of magnitude of latency imposed > * on request submission. 
> * > * So we try to be lazy, and keep the interrupts enabled until no > * more listeners appear within a breadcrumb interrupt interval (that > * is until a request completes that no one cares about). The > * observation is that listeners come in batches, and will often > * listen to a bunch of requests in succession. > * > * We also try to avoid raising too many interrupts, as they may > * be generated by userspace batches and it is unfortunately rather > * too easy to drown the CPU under a flood of GPU interrupts. Thus > * whenever no one appears to be listening, we turn off the interrupts. > * Fewer interrupts should conserve power -- at the very least, fewer > * interrupt draw less ire from other users of the system and tools > * like powertop. > */ It really feels like it should be a separate patch. I am not sure at the moment how exactly the hysteresis this would apply would look like. The end point is driven by the requests on the signaled list, but that is also driven by the timing of listeners adding themselves versus the request completion. For instance maybe if we want a hysteresis we want something more deterministic and explicit. Maybe tied directly to the user interrupt following no more listeners. Like simply disarm on the second irq work after all b->signalers have gone. I just can't picture the state or b->signaled_requests in relation to all dynamic actions. Regards, Tvrtko
Quoting Tvrtko Ursulin (2020-07-31 17:06:08) > > On 31/07/2020 16:21, Chris Wilson wrote: > > Quoting Chris Wilson (2020-07-31 16:12:32) > >> Quoting Tvrtko Ursulin (2020-07-31 16:06:55) > >>> > >>> On 30/07/2020 10:37, Chris Wilson wrote: > >>>> @@ -191,17 +188,19 @@ static void signal_irq_work(struct irq_work *work) > >>>> { > >>>> struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work); > >>>> const ktime_t timestamp = ktime_get(); > >>>> + struct llist_node *signal, *sn; > >>>> struct intel_context *ce, *cn; > >>>> struct list_head *pos, *next; > >>>> - LIST_HEAD(signal); > >>>> + > >>>> + signal = NULL; > >>>> + if (unlikely(!llist_empty(&b->signaled_requests))) > >>>> + signal = llist_del_all(&b->signaled_requests); > >>>> > >>>> spin_lock(&b->irq_lock); > >>>> > >>>> - if (list_empty(&b->signalers)) > >>>> + if (!signal && list_empty(&b->signalers)) > >>> > >>> The only open from previous round was on this change. If I understood > >>> your previous reply correctly, checking this or not simply controls the > >>> disarm point and is not related to this patch. With the check added now > >>> we would disarm later, because even already signaled requests would keep > >>> it armed. I would prefer this was a separate patch if you could possibly > >>> be convinced. > >> > >> I considered that since we add to the lockless list and then queue the > >> irq work, that is a path that is not driven by the interrupt and so > >> causing an issue with the idea of the interrupt shadow. Having a simple > >> test I thought was a positive side-effect to filter out the early > >> irq_work. > > > > How about a compromise and I sell the patch with a comment: > > /* > > * Keep the irq armed until the interrupt after all listeners are gone. > > * > > * Enabling/disabling the interrupt is rather costly, roughly a couple > > * of hundred microseconds. 
If we are proactive and enable/disable > > * the interrupt around every request that wants a breadcrumb, we > > * quickly drown in the extra orders of magnitude of latency imposed > > * on request submission. > > * > > * So we try to be lazy, and keep the interrupts enabled until no > > * more listeners appear within a breadcrumb interrupt interval (that > > * is until a request completes that no one cares about). The > > * observation is that listeners come in batches, and will often > > * listen to a bunch of requests in succession. > > * > > * We also try to avoid raising too many interrupts, as they may > > * be generated by userspace batches and it is unfortunately rather > > * too easy to drown the CPU under a flood of GPU interrupts. Thus > > * whenever no one appears to be listening, we turn off the interrupts. > > * Fewer interrupts should conserve power -- at the very least, fewer > > * interrupt draw less ire from other users of the system and tools > > * like powertop. > > */ > > It really feels like it should be a separate patch. > > I am not sure at the moment how exactly the hysteresis this would apply > would look like. The end point is driven by the requests on the signaled > list, but that is also driven by the timing of listeners adding > themselves versus the request completion. For instance maybe if we want > a hysteresis we won't something more deterministic and explicit. Maybe > tied directly to the user interrupt following no more listeners. Like > simply disarm on the second irq work after all b->signalers have gone. I > just can't picture the state or b->signaled_requests in relation to all > dynamic actions. > > Which is kind of the point. b->signaled_requests doesn't have a > relationship with the interrupt delivery. So we are changing the > behaviour by kicking the irq_work after adding to b->signaled_requests > independently of the interrupts, hence why we shouldn't be making that > change incidentally in the patch.
It is about conserving existing behaviour. -Chris
diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index fc6f0223d2c8..6a278bf0fc6b 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -174,16 +174,13 @@ static void add_retire(struct intel_breadcrumbs *b, struct intel_timeline *tl) intel_engine_add_retire(b->irq_engine, tl); } -static bool __signal_request(struct i915_request *rq, struct list_head *signals) +static bool __signal_request(struct i915_request *rq) { - clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); - if (!__dma_fence_signal(&rq->fence)) { i915_request_put(rq); return false; } - list_add_tail(&rq->signal_link, signals); return true; } @@ -191,17 +188,19 @@ static void signal_irq_work(struct irq_work *work) { struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work); const ktime_t timestamp = ktime_get(); + struct llist_node *signal, *sn; struct intel_context *ce, *cn; struct list_head *pos, *next; - LIST_HEAD(signal); + + signal = NULL; + if (unlikely(!llist_empty(&b->signaled_requests))) + signal = llist_del_all(&b->signaled_requests); spin_lock(&b->irq_lock); - if (list_empty(&b->signalers)) + if (!signal && list_empty(&b->signalers)) __intel_breadcrumbs_disarm_irq(b); - list_splice_init(&b->signaled_requests, &signal); - list_for_each_entry_safe(ce, cn, &b->signalers, signal_link) { GEM_BUG_ON(list_empty(&ce->signals)); @@ -218,7 +217,11 @@ static void signal_irq_work(struct irq_work *work) * spinlock as the callback chain may end up adding * more signalers to the same context or engine. 
*/ - __signal_request(rq, &signal); + clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); + if (__signal_request(rq)) { + rq->signal_node.next = signal; + signal = &rq->signal_node; + } } /* @@ -238,9 +241,9 @@ static void signal_irq_work(struct irq_work *work) spin_unlock(&b->irq_lock); - list_for_each_safe(pos, next, &signal) { + llist_for_each_safe(signal, sn, signal) { struct i915_request *rq = - list_entry(pos, typeof(*rq), signal_link); + llist_entry(signal, typeof(*rq), signal_node); struct list_head cb_list; spin_lock(&rq->lock); @@ -264,7 +267,7 @@ intel_breadcrumbs_create(struct intel_engine_cs *irq_engine) spin_lock_init(&b->irq_lock); INIT_LIST_HEAD(&b->signalers); - INIT_LIST_HEAD(&b->signaled_requests); + init_llist_head(&b->signaled_requests); init_irq_work(&b->irq_work, signal_irq_work); @@ -327,7 +330,8 @@ static void insert_breadcrumb(struct i915_request *rq, * its signal completion. */ if (__request_completed(rq)) { - if (__signal_request(rq, &b->signaled_requests)) + if (__signal_request(rq) && + llist_add(&rq->signal_node, &b->signaled_requests)) irq_work_queue(&b->irq_work); return; } diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h index 8e53b9942695..3fa19820b37a 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h @@ -35,7 +35,7 @@ struct intel_breadcrumbs { struct intel_engine_cs *irq_engine; struct list_head signalers; - struct list_head signaled_requests; + struct llist_head signaled_requests; struct irq_work irq_work; /* for use from inside irq_lock */ diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 16b721080195..874af6db6103 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -176,7 +176,11 @@ struct i915_request { struct intel_context *context; struct intel_ring *ring; struct intel_timeline __rcu *timeline; - struct 
list_head signal_link; + + union { + struct list_head signal_link; + struct llist_node signal_node; + }; /* * The rcu epoch of when this request was allocated. Used to judiciously
Make b->signaled_requests a lockless-list so that we can manipulate it outside of the b->irq_lock. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 30 +++++++++++-------- .../gpu/drm/i915/gt/intel_breadcrumbs_types.h | 2 +- drivers/gpu/drm/i915/i915_request.h | 6 +++- 3 files changed, 23 insertions(+), 15 deletions(-)