Message ID | 1464970133-29859-19-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 03/06/16 17:08, Chris Wilson wrote: > Under the assumption that enabling signaling will be a frequent > operation, lets preallocate our attachments for signaling inside the > request struct (and so benefiting from the slab cache). Oh you did this part which I suggested in the previous patch. :) > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_drv.h | 1 + > drivers/gpu/drm/i915/intel_breadcrumbs.c | 89 ++++++++++++++++++-------------- > drivers/gpu/drm/i915/intel_ringbuffer.h | 6 +++ > 3 files changed, 56 insertions(+), 40 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index b0235372cfdf..88d9242398ce 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2363,6 +2363,7 @@ struct drm_i915_gem_request { > struct drm_i915_private *i915; > struct intel_engine_cs *engine; > unsigned reset_counter; > + struct intel_signal_node signaling; > > /** GEM sequence number associated with the previous request, > * when the HWS breadcrumb is equal to this the GPU is processing > diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c > index 143891a2b68a..8ab508ed4248 100644 > --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c > +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c > @@ -128,16 +128,14 @@ static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b, > wake_up_process(wait->task); /* implicit smp_wmb() */ > } > > -bool intel_engine_add_wait(struct intel_engine_cs *engine, > - struct intel_wait *wait) > +static bool __intel_engine_add_wait(struct intel_engine_cs *engine, > + struct intel_wait *wait) > { > struct intel_breadcrumbs *b = &engine->breadcrumbs; > struct rb_node **p, *parent, *completed; > bool first; > u32 seqno; > > - spin_lock(&b->lock); > - > /* Insert the request into the retirement ordered list > * of waiters by walking the rbtree. 
If we are the oldest > * seqno in the tree (the first to be retired), then > @@ -223,6 +221,17 @@ bool intel_engine_add_wait(struct intel_engine_cs *engine, > GEM_BUG_ON(!b->first_wait); > GEM_BUG_ON(rb_first(&b->waiters) != &b->first_wait->node); > > + return first; > +} > + > +bool intel_engine_add_wait(struct intel_engine_cs *engine, > + struct intel_wait *wait) > +{ > + struct intel_breadcrumbs *b = &engine->breadcrumbs; > + bool first; > + > + spin_lock(&b->lock); > + first = __intel_engine_add_wait(engine, wait); > spin_unlock(&b->lock); > > return first; > @@ -323,35 +332,29 @@ out_unlock: > spin_unlock(&b->lock); > } > > -struct signal { > - struct rb_node node; > - struct intel_wait wait; > - struct drm_i915_gem_request *request; > -}; > - > -static bool signal_complete(struct signal *signal) > +static bool signal_complete(struct drm_i915_gem_request *request) > { > - if (signal == NULL) > + if (request == NULL) > return false; > > /* If another process served as the bottom-half it may have already > * signalled that this wait is already completed. > */ > - if (intel_wait_complete(&signal->wait)) > + if (intel_wait_complete(&request->signaling.wait)) > return true; > > /* Carefully check if the request is complete, giving time for the > * seqno to be visible or if the GPU hung. > */ > - if (__i915_request_irq_complete(signal->request)) > + if (__i915_request_irq_complete(request)) > return true; > > return false; > } > > -static struct signal *to_signal(struct rb_node *rb) > +static struct drm_i915_gem_request *to_signal(struct rb_node *rb) Why it is call to_signal then? 
> { > - return container_of(rb, struct signal, node); > + return container_of(rb, struct drm_i915_gem_request, signaling.node); > } > > static void signaler_set_rtpriority(void) > @@ -364,7 +367,7 @@ static int intel_breadcrumbs_signaler(void *arg) > { > struct intel_engine_cs *engine = arg; > struct intel_breadcrumbs *b = &engine->breadcrumbs; > - struct signal *signal; > + struct drm_i915_gem_request *request; > > /* Install ourselves with high priority to reduce signalling latency */ > signaler_set_rtpriority(); > @@ -380,14 +383,13 @@ static int intel_breadcrumbs_signaler(void *arg) > * need to wait for a new interrupt from the GPU or for > * a new client. > */ > - signal = READ_ONCE(b->first_signal); > - if (signal_complete(signal)) { > + request = READ_ONCE(b->first_signal); > + if (signal_complete(request)) { > /* Wake up all other completed waiters and select the > * next bottom-half for the next user interrupt. > */ > - intel_engine_remove_wait(engine, &signal->wait); > - > - i915_gem_request_unreference(signal->request); > + intel_engine_remove_wait(engine, > + &request->signaling.wait); > > /* Find the next oldest signal. Note that as we have > * not been holding the lock, another client may > @@ -396,12 +398,15 @@ static int intel_breadcrumbs_signaler(void *arg) > * the oldest before picking the next one. > */ > spin_lock(&b->lock); > - if (signal == b->first_signal) > - b->first_signal = rb_next(&signal->node); > - rb_erase(&signal->node, &b->signals); > + if (request == b->first_signal) { > + struct rb_node *rb = > + rb_next(&request->signaling.node); > + b->first_signal = rb ? to_signal(rb) : NULL; Made me look in the previous patch on how you didn't need to change the type for first_signal in this one. void* ! :) Please fix there. 
:) > + } > + rb_erase(&request->signaling.node, &b->signals); > spin_unlock(&b->lock); > > - kfree(signal); > + i915_gem_request_unreference(request); > } else { > if (kthread_should_stop()) > break; > @@ -418,20 +423,23 @@ int intel_engine_enable_signaling(struct drm_i915_gem_request *request) > struct intel_engine_cs *engine = request->engine; > struct intel_breadcrumbs *b = &engine->breadcrumbs; > struct rb_node *parent, **p; > - struct signal *signal; > bool first, wakeup; > > if (unlikely(IS_ERR(b->signaler))) > return PTR_ERR(b->signaler); > > - signal = kmalloc(sizeof(*signal), GFP_ATOMIC); > - if (unlikely(!signal)) > - return -ENOMEM; > + if (unlikely(READ_ONCE(request->signaling.wait.task))) > + return 0; Hmm it will depend on following patches whether this is safe. I don't like the explosion of READ_ONCE and smp_store_mb's in these patches. Something is bound to be broken. You even check it below under the lock. So I am not sure this optimisation is worth it. Maybe leave it for later? > > - signal->wait.task = b->signaler; > - signal->wait.seqno = request->seqno; > + spin_lock(&b->lock); > + if (unlikely(request->signaling.wait.task)) { > + wakeup = false; > + goto unlock; > + } > > - signal->request = i915_gem_request_reference(request); > + request->signaling.wait.task = b->signaler; > + request->signaling.wait.seqno = request->seqno; > + i915_gem_request_reference(request); > > /* First add ourselves into the list of waiters, but register our > * bottom-half as the signaller thread. As per usual, only the oldest > @@ -441,29 +449,30 @@ int intel_engine_enable_signaling(struct drm_i915_gem_request *request) > * If we are the oldest waiter, enable the irq (after which we > * must double check that the seqno did not complete). > */ > - wakeup = intel_engine_add_wait(engine, &signal->wait); > + wakeup = __intel_engine_add_wait(engine, &request->signaling.wait); > > /* Now insert ourselves into the retirement ordered list of signals > * on this engine. 
We track the oldest seqno as that will be the > * first signal to complete. > */ > - spin_lock(&b->lock); > parent = NULL; > first = true; > p = &b->signals.rb_node; > while (*p) { > parent = *p; > - if (i915_seqno_passed(signal->wait.seqno, > - to_signal(parent)->wait.seqno)) { > + if (i915_seqno_passed(request->seqno, > + to_signal(parent)->seqno)) { > p = &parent->rb_right; > first = false; > } else > p = &parent->rb_left; > } > - rb_link_node(&signal->node, parent, p); > - rb_insert_color(&signal->node, &b->signals); > + rb_link_node(&request->signaling.node, parent, p); > + rb_insert_color(&request->signaling.node, &b->signals); > if (first) > - smp_store_mb(b->first_signal, signal); > + smp_store_mb(b->first_signal, request); > + > +unlock: > spin_unlock(&b->lock); > > if (wakeup) > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index f4bca38caef0..5f7cb3d0ea1c 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -530,6 +530,12 @@ struct intel_wait { > struct task_struct *task; > u32 seqno; > }; > + > +struct intel_signal_node { > + struct rb_node node; > + struct intel_wait wait; > +}; > + > void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine); > static inline void intel_wait_init(struct intel_wait *wait, u32 seqno) > { > Otherwise looks OK. Regards, Tvrtko
On Tue, Jun 07, 2016 at 01:31:25PM +0100, Tvrtko Ursulin wrote: > >@@ -418,20 +423,23 @@ int intel_engine_enable_signaling(struct drm_i915_gem_request *request) > > struct intel_engine_cs *engine = request->engine; > > struct intel_breadcrumbs *b = &engine->breadcrumbs; > > struct rb_node *parent, **p; > >- struct signal *signal; > > bool first, wakeup; > > > > if (unlikely(IS_ERR(b->signaler))) > > return PTR_ERR(b->signaler); > > > >- signal = kmalloc(sizeof(*signal), GFP_ATOMIC); > >- if (unlikely(!signal)) > >- return -ENOMEM; > >+ if (unlikely(READ_ONCE(request->signaling.wait.task))) > >+ return 0; > > Hmm it will depend on following patches whether this is safe. I > don't like the explosion of READ_ONCE and smp_store_mb's in these > patches. Something is bound to be broken. This one is trivial :) > You even check it below under the lock. So I am not sure this > optimisation is worth it. Maybe leave it for later? I was just worrying about contention since the breadcrumbs lock is being used to guard both trees and contention in the waiters is noticeable. -Chris
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index b0235372cfdf..88d9242398ce 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2363,6 +2363,7 @@ struct drm_i915_gem_request { struct drm_i915_private *i915; struct intel_engine_cs *engine; unsigned reset_counter; + struct intel_signal_node signaling; /** GEM sequence number associated with the previous request, * when the HWS breadcrumb is equal to this the GPU is processing diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index 143891a2b68a..8ab508ed4248 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -128,16 +128,14 @@ static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b, wake_up_process(wait->task); /* implicit smp_wmb() */ } -bool intel_engine_add_wait(struct intel_engine_cs *engine, - struct intel_wait *wait) +static bool __intel_engine_add_wait(struct intel_engine_cs *engine, + struct intel_wait *wait) { struct intel_breadcrumbs *b = &engine->breadcrumbs; struct rb_node **p, *parent, *completed; bool first; u32 seqno; - spin_lock(&b->lock); - /* Insert the request into the retirement ordered list * of waiters by walking the rbtree. 
If we are the oldest * seqno in the tree (the first to be retired), then @@ -223,6 +221,17 @@ bool intel_engine_add_wait(struct intel_engine_cs *engine, GEM_BUG_ON(!b->first_wait); GEM_BUG_ON(rb_first(&b->waiters) != &b->first_wait->node); + return first; +} + +bool intel_engine_add_wait(struct intel_engine_cs *engine, + struct intel_wait *wait) +{ + struct intel_breadcrumbs *b = &engine->breadcrumbs; + bool first; + + spin_lock(&b->lock); + first = __intel_engine_add_wait(engine, wait); spin_unlock(&b->lock); return first; @@ -323,35 +332,29 @@ out_unlock: spin_unlock(&b->lock); } -struct signal { - struct rb_node node; - struct intel_wait wait; - struct drm_i915_gem_request *request; -}; - -static bool signal_complete(struct signal *signal) +static bool signal_complete(struct drm_i915_gem_request *request) { - if (signal == NULL) + if (request == NULL) return false; /* If another process served as the bottom-half it may have already * signalled that this wait is already completed. */ - if (intel_wait_complete(&signal->wait)) + if (intel_wait_complete(&request->signaling.wait)) return true; /* Carefully check if the request is complete, giving time for the * seqno to be visible or if the GPU hung. 
*/ - if (__i915_request_irq_complete(signal->request)) + if (__i915_request_irq_complete(request)) return true; return false; } -static struct signal *to_signal(struct rb_node *rb) +static struct drm_i915_gem_request *to_signal(struct rb_node *rb) { - return container_of(rb, struct signal, node); + return container_of(rb, struct drm_i915_gem_request, signaling.node); } static void signaler_set_rtpriority(void) @@ -364,7 +367,7 @@ static int intel_breadcrumbs_signaler(void *arg) { struct intel_engine_cs *engine = arg; struct intel_breadcrumbs *b = &engine->breadcrumbs; - struct signal *signal; + struct drm_i915_gem_request *request; /* Install ourselves with high priority to reduce signalling latency */ signaler_set_rtpriority(); @@ -380,14 +383,13 @@ static int intel_breadcrumbs_signaler(void *arg) * need to wait for a new interrupt from the GPU or for * a new client. */ - signal = READ_ONCE(b->first_signal); - if (signal_complete(signal)) { + request = READ_ONCE(b->first_signal); + if (signal_complete(request)) { /* Wake up all other completed waiters and select the * next bottom-half for the next user interrupt. */ - intel_engine_remove_wait(engine, &signal->wait); - - i915_gem_request_unreference(signal->request); + intel_engine_remove_wait(engine, + &request->signaling.wait); /* Find the next oldest signal. Note that as we have * not been holding the lock, another client may @@ -396,12 +398,15 @@ static int intel_breadcrumbs_signaler(void *arg) * the oldest before picking the next one. */ spin_lock(&b->lock); - if (signal == b->first_signal) - b->first_signal = rb_next(&signal->node); - rb_erase(&signal->node, &b->signals); + if (request == b->first_signal) { + struct rb_node *rb = + rb_next(&request->signaling.node); + b->first_signal = rb ? 
to_signal(rb) : NULL; + } + rb_erase(&request->signaling.node, &b->signals); spin_unlock(&b->lock); - kfree(signal); + i915_gem_request_unreference(request); } else { if (kthread_should_stop()) break; @@ -418,20 +423,23 @@ int intel_engine_enable_signaling(struct drm_i915_gem_request *request) struct intel_engine_cs *engine = request->engine; struct intel_breadcrumbs *b = &engine->breadcrumbs; struct rb_node *parent, **p; - struct signal *signal; bool first, wakeup; if (unlikely(IS_ERR(b->signaler))) return PTR_ERR(b->signaler); - signal = kmalloc(sizeof(*signal), GFP_ATOMIC); - if (unlikely(!signal)) - return -ENOMEM; + if (unlikely(READ_ONCE(request->signaling.wait.task))) + return 0; - signal->wait.task = b->signaler; - signal->wait.seqno = request->seqno; + spin_lock(&b->lock); + if (unlikely(request->signaling.wait.task)) { + wakeup = false; + goto unlock; + } - signal->request = i915_gem_request_reference(request); + request->signaling.wait.task = b->signaler; + request->signaling.wait.seqno = request->seqno; + i915_gem_request_reference(request); /* First add ourselves into the list of waiters, but register our * bottom-half as the signaller thread. As per usual, only the oldest @@ -441,29 +449,30 @@ int intel_engine_enable_signaling(struct drm_i915_gem_request *request) * If we are the oldest waiter, enable the irq (after which we * must double check that the seqno did not complete). */ - wakeup = intel_engine_add_wait(engine, &signal->wait); + wakeup = __intel_engine_add_wait(engine, &request->signaling.wait); /* Now insert ourselves into the retirement ordered list of signals * on this engine. We track the oldest seqno as that will be the * first signal to complete. 
*/ - spin_lock(&b->lock); parent = NULL; first = true; p = &b->signals.rb_node; while (*p) { parent = *p; - if (i915_seqno_passed(signal->wait.seqno, - to_signal(parent)->wait.seqno)) { + if (i915_seqno_passed(request->seqno, + to_signal(parent)->seqno)) { p = &parent->rb_right; first = false; } else p = &parent->rb_left; } - rb_link_node(&signal->node, parent, p); - rb_insert_color(&signal->node, &b->signals); + rb_link_node(&request->signaling.node, parent, p); + rb_insert_color(&request->signaling.node, &b->signals); if (first) - smp_store_mb(b->first_signal, signal); + smp_store_mb(b->first_signal, request); + +unlock: spin_unlock(&b->lock); if (wakeup) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index f4bca38caef0..5f7cb3d0ea1c 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -530,6 +530,12 @@ struct intel_wait { struct task_struct *task; u32 seqno; }; + +struct intel_signal_node { + struct rb_node node; + struct intel_wait wait; +}; + void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine); static inline void intel_wait_init(struct intel_wait *wait, u32 seqno) {
Under the assumption that enabling signaling will be a frequent operation, let's preallocate our attachments for signaling inside the request struct (and so benefiting from the slab cache). Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/intel_breadcrumbs.c | 89 ++++++++++++++++++-------------- drivers/gpu/drm/i915/intel_ringbuffer.h | 6 +++ 3 files changed, 56 insertions(+), 40 deletions(-)