Message ID | 20190814092643.1908-1-chris@chris-wilson.co.uk (mailing list archive)
---|---
State | New, archived
Series | [1/8] drm/i915/execlists: Lift process_csb() out of the irq-off spinlock
Chris Wilson <chris@chris-wilson.co.uk> writes:

> If we only call process_csb() from the tasklet, though we lose the
> ability to bypass ksoftirqd interrupt processing on direct submission
> paths, we can push it out of the irq-off spinlock.
>
> The penalty is that we then allow schedule_out to be called concurrently
> with schedule_in, requiring us to handle the usage count (baked into the
> pointer itself) atomically.
>
> As we do kick the tasklets (via local_bh_enable()) after our submission,
> there is a possibility there to see if we can pull the local softirq
> processing back from the ksoftirqd.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/gt/intel_context_types.h |   4 +-
>  drivers/gpu/drm/i915/gt/intel_engine_cs.c     |   2 +-
>  drivers/gpu/drm/i915/gt/intel_lrc.c           | 130 +++++++++++-------
>  drivers/gpu/drm/i915/i915_utils.h             |  20 ++-
>  4 files changed, 94 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index a632b20ec4d8..d8ce266c049f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -41,9 +41,7 @@ struct intel_context {
>  	struct intel_engine_cs *engine;
>  	struct intel_engine_cs *inflight;
>  #define intel_context_inflight(ce) ptr_mask_bits((ce)->inflight, 2)
> -#define intel_context_inflight_count(ce) ptr_unmask_bits((ce)->inflight, 2)
> -#define intel_context_inflight_inc(ce) ptr_count_inc(&(ce)->inflight)
> -#define intel_context_inflight_dec(ce) ptr_count_dec(&(ce)->inflight)
> +#define intel_context_inflight_count(ce) ptr_unmask_bits((ce)->inflight, 2)
>
>  	struct i915_address_space *vm;
>  	struct i915_gem_context *gem_context;
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index d20750e420c0..abd4fde2e52d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1460,7 +1460,7 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
>
>  	for (port = execlists->pending; (rq = *port); port++) {
>  		/* Exclude any contexts already counted in active */
> -		if (intel_context_inflight_count(rq->hw_context) == 1)
> +		if (!intel_context_inflight_count(rq->hw_context))
>  			engine->stats.active++;
>  	}
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 5c26c4ae139b..09d8cb8615cf 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -547,27 +547,39 @@ execlists_context_status_change(struct i915_request *rq, unsigned long status)
>  				   status, rq);
>  }
>
> +static inline struct intel_engine_cs *
> +__execlists_schedule_in(struct i915_request *rq)
> +{
> +	struct intel_engine_cs * const engine = rq->engine;
> +	struct intel_context * const ce = rq->hw_context;
> +
> +	intel_context_get(ce);
> +
> +	intel_gt_pm_get(engine->gt);
> +	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> +	intel_engine_context_in(engine);
> +
> +	return engine;
> +}
> +
>  static inline struct i915_request *
>  execlists_schedule_in(struct i915_request *rq, int idx)
>  {
> -	struct intel_context *ce = rq->hw_context;
> -	int count;
> +	struct intel_context * const ce = rq->hw_context;
> +	struct intel_engine_cs *old;
>
> +	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
>  	trace_i915_request_in(rq, idx);
>
> -	count = intel_context_inflight_count(ce);
> -	if (!count) {
> -		intel_context_get(ce);
> -		ce->inflight = rq->engine;
> -
> -		intel_gt_pm_get(ce->inflight->gt);
> -		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> -		intel_engine_context_in(ce->inflight);
> -	}
> +	old = READ_ONCE(ce->inflight);
> +	do {
> +		if (!old) {

The schedule-out might have swapped inflight in here, ruining our day.
So I am trying to figure out how you pull it off.

> +			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
> +			break;
> +		}
> +	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
>
> -	intel_context_inflight_inc(ce);
>  	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> -
>  	return i915_request_get(rq);
>  }
>
> @@ -581,35 +593,45 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
>  }
>
>  static inline void
> -execlists_schedule_out(struct i915_request *rq)
> +__execlists_schedule_out(struct i915_request *rq,
> +			 struct intel_engine_cs * const engine)
>  {
> -	struct intel_context *ce = rq->hw_context;
> +	struct intel_context * const ce = rq->hw_context;
>
> -	GEM_BUG_ON(!intel_context_inflight_count(ce));
> +	intel_engine_context_out(engine);
> +	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> +	intel_gt_pm_put(engine->gt);
>
> -	trace_i915_request_out(rq);
> +	/*
> +	 * If this is part of a virtual engine, its next request may
> +	 * have been blocked waiting for access to the active context.
> +	 * We have to kick all the siblings again in case we need to
> +	 * switch (e.g. the next request is not runnable on this
> +	 * engine). Hopefully, we will already have submitted the next
> +	 * request before the tasklet runs and do not need to rebuild
> +	 * each virtual tree and kick everyone again.
> +	 */
> +	if (ce->engine != engine)
> +		kick_siblings(rq, ce);
>
> -	intel_context_inflight_dec(ce);
> -	if (!intel_context_inflight_count(ce)) {
> -		intel_engine_context_out(ce->inflight);
> -		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> -		intel_gt_pm_put(ce->inflight->gt);
> +	intel_context_put(ce);
> +}
>
> -		/*
> -		 * If this is part of a virtual engine, its next request may
> -		 * have been blocked waiting for access to the active context.
> -		 * We have to kick all the siblings again in case we need to
> -		 * switch (e.g. the next request is not runnable on this
> -		 * engine). Hopefully, we will already have submitted the next
> -		 * request before the tasklet runs and do not need to rebuild
> -		 * each virtual tree and kick everyone again.
> -		 */
> -		ce->inflight = NULL;
> -		if (rq->engine != ce->engine)
> -			kick_siblings(rq, ce);
> +static inline void
> +execlists_schedule_out(struct i915_request *rq)
> +{
> +	struct intel_context * const ce = rq->hw_context;
> +	struct intel_engine_cs *cur, *old;
>
> -		intel_context_put(ce);
> -	}
> +	trace_i915_request_out(rq);
> +	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> +
> +	old = READ_ONCE(ce->inflight);
> +	do
> +		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
> +	while (!try_cmpxchg(&ce->inflight, &old, cur));
> +	if (!cur)
> +		__execlists_schedule_out(rq, old);
>
>  	i915_request_put(rq);
>  }
> @@ -684,6 +706,9 @@ assert_pending_valid(const struct intel_engine_execlists *execlists,
>
>  	trace_ports(execlists, msg, execlists->pending);
>
> +	if (!execlists->pending[0])
> +		return false;
> +
>  	if (execlists->pending[execlists_num_ports(execlists)])
>  		return false;
>
> @@ -944,9 +969,21 @@ need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
>  static bool
>  enable_timeslice(struct intel_engine_cs *engine)
>  {
> -	struct i915_request *last = last_active(&engine->execlists);
> +	struct i915_request * const *port;
> +	int hint;
> +
> +	port = engine->execlists.active;
> +	while (port[0] && i915_request_completed(port[0]))
> +		port++;
> +	if (!port[0])
> +		return false;
>
> -	return last && need_timeslice(engine, last);
> +	hint = engine->execlists.queue_priority_hint;
> +	if (port[1])
> +		hint = max(rq_prio(port[1]), hint);
> +
> +	/* Compare the two end-points as an unlocked approximation */
> +	return hint >= effective_prio(port[0]);

What happens if we get it wrong?

>  }
>
>  static void record_preemption(struct intel_engine_execlists *execlists)
> @@ -1356,7 +1393,6 @@ static void process_csb(struct intel_engine_cs *engine)
>  	const u8 num_entries = execlists->csb_size;
>  	u8 head, tail;
>
> -	lockdep_assert_held(&engine->active.lock);
>  	GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915));
>
>  	/*
> @@ -1427,15 +1463,14 @@ static void process_csb(struct intel_engine_cs *engine)
>  			       execlists->pending,
>  			       execlists_num_ports(execlists) *
>  			       sizeof(*execlists->pending));
> -			execlists->pending[0] = NULL;
> -
> -			trace_ports(execlists, "promoted", execlists->active);
>
>  			if (enable_timeslice(engine))
>  				mod_timer(&execlists->timer, jiffies + 1);
>
>  			if (!inject_preempt_hang(execlists))
>  				ring_set_paused(engine, 0);
> +
> +			WRITE_ONCE(execlists->pending[0], NULL);
>  			break;
>
>  		case CSB_COMPLETE: /* port0 completed, advanced to port1 */
> @@ -1479,8 +1514,6 @@ static void process_csb(struct intel_engine_cs *engine)
>  static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
>  {
>  	lockdep_assert_held(&engine->active.lock);
> -
> -	process_csb(engine);
>  	if (!engine->execlists.pending[0])
>  		execlists_dequeue(engine);
>  }
> @@ -1494,9 +1527,12 @@ static void execlists_submission_tasklet(unsigned long data)
>  	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
>  	unsigned long flags;
>
> -	spin_lock_irqsave(&engine->active.lock, flags);
> -	__execlists_submission_tasklet(engine);
> -	spin_unlock_irqrestore(&engine->active.lock, flags);
> +	process_csb(engine);
> +	if (!engine->execlists.pending[0]) {

!READ_ONCE(...)? It would at least pair, documentation-wise.

> +		spin_lock_irqsave(&engine->active.lock, flags);
> +		__execlists_submission_tasklet(engine);
> +		spin_unlock_irqrestore(&engine->active.lock, flags);
> +	}
>  }
>
>  static void execlists_submission_timer(struct timer_list *timer)
> diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> index d652ba5d2320..562f756da421 100644
> --- a/drivers/gpu/drm/i915/i915_utils.h
> +++ b/drivers/gpu/drm/i915/i915_utils.h
> @@ -161,17 +161,15 @@ __check_struct_size(size_t base, size_t arr, size_t count, size_t *size)
>  	((typeof(ptr))((unsigned long)(ptr) | __bits)); \
>  })
>
> -#define ptr_count_dec(p_ptr) do { \
> -	typeof(p_ptr) __p = (p_ptr); \
> -	unsigned long __v = (unsigned long)(*__p); \
> -	*__p = (typeof(*p_ptr))(--__v); \
> -} while (0)
> -
> -#define ptr_count_inc(p_ptr) do { \
> -	typeof(p_ptr) __p = (p_ptr); \
> -	unsigned long __v = (unsigned long)(*__p); \
> -	*__p = (typeof(*p_ptr))(++__v); \
> -} while (0)
> +#define ptr_dec(ptr) ({ \
> +	unsigned long __v = (unsigned long)(ptr); \
> +	(typeof(ptr))(__v - 1); \
> +})
> +
> +#define ptr_inc(ptr) ({ \
> +	unsigned long __v = (unsigned long)(ptr); \
> +	(typeof(ptr))(__v + 1); \
> +})

Should we add a GEM_DEBUG_WARN_ON if we overflow, or do we detect it
through other means?
-Mika

>
>  #define page_mask_bits(ptr) ptr_mask_bits(ptr, PAGE_SHIFT)
>  #define page_unmask_bits(ptr) ptr_unmask_bits(ptr, PAGE_SHIFT)
> --
> 2.23.0.rc1
Quoting Mika Kuoppala (2019-08-16 08:50:29)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >  static inline struct i915_request *
> >  execlists_schedule_in(struct i915_request *rq, int idx)
> >  {
> > -	struct intel_context *ce = rq->hw_context;
> > -	int count;
> > +	struct intel_context * const ce = rq->hw_context;
> > +	struct intel_engine_cs *old;
> >
> > +	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
> >  	trace_i915_request_in(rq, idx);
> >
> > -	count = intel_context_inflight_count(ce);
> > -	if (!count) {
> > -		intel_context_get(ce);
> > -		ce->inflight = rq->engine;
> > -
> > -		intel_gt_pm_get(ce->inflight->gt);
> > -		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> > -		intel_engine_context_in(ce->inflight);
> > -	}
> > +	old = READ_ONCE(ce->inflight);
> > +	do {
> > +		if (!old) {
>
> The schedule-out might have swapped inflight in here, ruining our day.
> So I am trying to figure out how you pull it off.

Once we _know_ the context is idle, the only way it can become busy
again is via submitting a request under the engine->active.lock, which
we hold.

Note that schedule-out also has exclusive access by its caller (only one
submission tasklet at a time), but schedule-out may run concurrently
with schedule-in. (But once we idle a context in schedule-out, we will
never see it again until a schedule-in, hence we know that upon seeing
NULL we have exclusive access.)

> > +			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
> > +			break;
> > +		}
> > +	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
> >
> > -	intel_context_inflight_inc(ce);
> >  	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> > -
> >  	return i915_request_get(rq);
> >  }

> >  enable_timeslice(struct intel_engine_cs *engine)
> >  {
> > -	struct i915_request *last = last_active(&engine->execlists);
> > +	struct i915_request * const *port;
> > +	int hint;
> > +
> > +	port = engine->execlists.active;
> > +	while (port[0] && i915_request_completed(port[0]))
> > +		port++;
> > +	if (!port[0])
> > +		return false;
> >
> > -	return last && need_timeslice(engine, last);
> > +	hint = engine->execlists.queue_priority_hint;
> > +	if (port[1])
> > +		hint = max(rq_prio(port[1]), hint);
> > +
> > +	/* Compare the two end-points as an unlocked approximation */
> > +	return hint >= effective_prio(port[0]);
>
> What happens if we get it wrong?

So the last element in the next context is always the lowest priority
(normally they are all the same priority). If there is a variation in
priority along the requests in the second context, that may mask that
the first request there should trigger a timeslice. Storing an
execlists.switch_priority_hint should remove the ambiguity.

> > @@ -1494,9 +1527,12 @@ static void execlists_submission_tasklet(unsigned long data)
> >  	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> >  	unsigned long flags;
> >
> > -	spin_lock_irqsave(&engine->active.lock, flags);
> > -	__execlists_submission_tasklet(engine);
> > -	spin_unlock_irqrestore(&engine->active.lock, flags);
> > +	process_csb(engine);
> > +	if (!engine->execlists.pending[0]) {
>
> !READ_ONCE(...)? It would at least pair, documentation-wise.

How about if (process_csb()) {

> > +		spin_lock_irqsave(&engine->active.lock, flags);
> > +		__execlists_submission_tasklet(engine);
> > +		spin_unlock_irqrestore(&engine->active.lock, flags);
> > +	}
> >  }
> >
> >  static void execlists_submission_timer(struct timer_list *timer)
> > diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> > index d652ba5d2320..562f756da421 100644
> > --- a/drivers/gpu/drm/i915/i915_utils.h
> > +++ b/drivers/gpu/drm/i915/i915_utils.h
> > @@ -161,17 +161,15 @@ __check_struct_size(size_t base, size_t arr, size_t count, size_t *size)
> >  	((typeof(ptr))((unsigned long)(ptr) | __bits)); \
> >  })
> >
> > -#define ptr_count_dec(p_ptr) do { \
> > -	typeof(p_ptr) __p = (p_ptr); \
> > -	unsigned long __v = (unsigned long)(*__p); \
> > -	*__p = (typeof(*p_ptr))(--__v); \
> > -} while (0)
> > -
> > -#define ptr_count_inc(p_ptr) do { \
> > -	typeof(p_ptr) __p = (p_ptr); \
> > -	unsigned long __v = (unsigned long)(*__p); \
> > -	*__p = (typeof(*p_ptr))(++__v); \
> > -} while (0)
> > +#define ptr_dec(ptr) ({ \
> > +	unsigned long __v = (unsigned long)(ptr); \
> > +	(typeof(ptr))(__v - 1); \
> > +})
> > +
> > +#define ptr_inc(ptr) ({ \
> > +	unsigned long __v = (unsigned long)(ptr); \
> > +	(typeof(ptr))(__v + 1); \
> > +})
>
> Should we add a GEM_DEBUG_WARN_ON if we overflow, or do we detect it
> through other means?

What's an overflow? What's the alignment of the target pointer? Perhaps
the user intended that every 8 uses start at the next location. That's
not known at this basic level. And these are decidedly
use-at-your-own-risk :)
-Chris
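
[The locking argument in this exchange is easier to follow restated in
miniature. The sketch below uses plain C11 atomics, not the driver's
code: the kernel's try_cmpxchg() is stood in for by
atomic_compare_exchange_weak() (which likewise rewrites the expected
value on failure), and the engine/context types are opaque placeholders.
Only the idle-to-busy transition stores the bare engine pointer; every
later submission bumps the count packed into the low bits, and the
completion side decrements until the final drop signals the
busy-to-idle work.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct engine;                          /* stand-in for intel_engine_cs */
struct context {                        /* stand-in for intel_context */
	_Atomic uintptr_t inflight;     /* engine pointer | 2-bit use count */
};

#define COUNT_MASK ((uintptr_t)3)       /* low bits spare on aligned pointers */

/*
 * First submission claims the idle context. Callers are serialised by
 * the submission lock, so the NULL->engine transition cannot race with
 * another claim; only the count updates race the completion path.
 */
static void schedule_in(struct context *ce, struct engine *engine)
{
	uintptr_t old = atomic_load(&ce->inflight);

	do {
		if (!old) {     /* idle: exclusive, a plain store suffices */
			atomic_store(&ce->inflight, (uintptr_t)engine);
			return;
		}
		/* busy: bump the count packed in the low bits */
	} while (!atomic_compare_exchange_weak(&ce->inflight, &old, old + 1));
}

/*
 * Completion path; may run concurrently with schedule_in(). Returns true
 * to the one caller that performed the final busy->idle transition, which
 * then does the context-out/pm-put work outside the loop.
 */
static bool schedule_out(struct context *ce)
{
	uintptr_t old = atomic_load(&ce->inflight), cur;

	do
		cur = (old & COUNT_MASK) ? old - 1 : 0;
	while (!atomic_compare_exchange_weak(&ce->inflight, &old, cur));

	return !cur;
}

This is the invariant Chris describes, in code form: once schedule_out()
publishes 0, nothing touches the field again until the next
schedule_in(), and that schedule_in() holds the submission lock, so
observing NULL really does mean exclusive access.]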
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index a632b20ec4d8..d8ce266c049f 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -41,9 +41,7 @@ struct intel_context {
 	struct intel_engine_cs *engine;
 	struct intel_engine_cs *inflight;
 #define intel_context_inflight(ce) ptr_mask_bits((ce)->inflight, 2)
-#define intel_context_inflight_count(ce) ptr_unmask_bits((ce)->inflight, 2)
-#define intel_context_inflight_inc(ce) ptr_count_inc(&(ce)->inflight)
-#define intel_context_inflight_dec(ce) ptr_count_dec(&(ce)->inflight)
+#define intel_context_inflight_count(ce) ptr_unmask_bits((ce)->inflight, 2)
 
 	struct i915_address_space *vm;
 	struct i915_gem_context *gem_context;
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index d20750e420c0..abd4fde2e52d 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1460,7 +1460,7 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
 
 	for (port = execlists->pending; (rq = *port); port++) {
 		/* Exclude any contexts already counted in active */
-		if (intel_context_inflight_count(rq->hw_context) == 1)
+		if (!intel_context_inflight_count(rq->hw_context))
 			engine->stats.active++;
 	}
 
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 5c26c4ae139b..09d8cb8615cf 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -547,27 +547,39 @@ execlists_context_status_change(struct i915_request *rq, unsigned long status)
 				   status, rq);
 }
 
+static inline struct intel_engine_cs *
+__execlists_schedule_in(struct i915_request *rq)
+{
+	struct intel_engine_cs * const engine = rq->engine;
+	struct intel_context * const ce = rq->hw_context;
+
+	intel_context_get(ce);
+
+	intel_gt_pm_get(engine->gt);
+	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
+	intel_engine_context_in(engine);
+
+	return engine;
+}
+
 static inline struct i915_request *
 execlists_schedule_in(struct i915_request *rq, int idx)
 {
-	struct intel_context *ce = rq->hw_context;
-	int count;
+	struct intel_context * const ce = rq->hw_context;
+	struct intel_engine_cs *old;
 
+	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
 	trace_i915_request_in(rq, idx);
 
-	count = intel_context_inflight_count(ce);
-	if (!count) {
-		intel_context_get(ce);
-		ce->inflight = rq->engine;
-
-		intel_gt_pm_get(ce->inflight->gt);
-		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
-		intel_engine_context_in(ce->inflight);
-	}
+	old = READ_ONCE(ce->inflight);
+	do {
+		if (!old) {
+			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
+			break;
+		}
+	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
 
-	intel_context_inflight_inc(ce);
 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
-
 	return i915_request_get(rq);
 }
 
@@ -581,35 +593,45 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
 }
 
 static inline void
-execlists_schedule_out(struct i915_request *rq)
+__execlists_schedule_out(struct i915_request *rq,
+			 struct intel_engine_cs * const engine)
 {
-	struct intel_context *ce = rq->hw_context;
+	struct intel_context * const ce = rq->hw_context;
 
-	GEM_BUG_ON(!intel_context_inflight_count(ce));
+	intel_engine_context_out(engine);
+	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
+	intel_gt_pm_put(engine->gt);
 
-	trace_i915_request_out(rq);
+	/*
+	 * If this is part of a virtual engine, its next request may
+	 * have been blocked waiting for access to the active context.
+	 * We have to kick all the siblings again in case we need to
+	 * switch (e.g. the next request is not runnable on this
+	 * engine). Hopefully, we will already have submitted the next
+	 * request before the tasklet runs and do not need to rebuild
+	 * each virtual tree and kick everyone again.
+	 */
+	if (ce->engine != engine)
+		kick_siblings(rq, ce);
 
-	intel_context_inflight_dec(ce);
-	if (!intel_context_inflight_count(ce)) {
-		intel_engine_context_out(ce->inflight);
-		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
-		intel_gt_pm_put(ce->inflight->gt);
+	intel_context_put(ce);
+}
 
-		/*
-		 * If this is part of a virtual engine, its next request may
-		 * have been blocked waiting for access to the active context.
-		 * We have to kick all the siblings again in case we need to
-		 * switch (e.g. the next request is not runnable on this
-		 * engine). Hopefully, we will already have submitted the next
-		 * request before the tasklet runs and do not need to rebuild
-		 * each virtual tree and kick everyone again.
-		 */
-		ce->inflight = NULL;
-		if (rq->engine != ce->engine)
-			kick_siblings(rq, ce);
+static inline void
+execlists_schedule_out(struct i915_request *rq)
+{
+	struct intel_context * const ce = rq->hw_context;
+	struct intel_engine_cs *cur, *old;
 
-		intel_context_put(ce);
-	}
+	trace_i915_request_out(rq);
+	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
+
+	old = READ_ONCE(ce->inflight);
+	do
+		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
+	while (!try_cmpxchg(&ce->inflight, &old, cur));
+	if (!cur)
+		__execlists_schedule_out(rq, old);
 
 	i915_request_put(rq);
 }
@@ -684,6 +706,9 @@ assert_pending_valid(const struct intel_engine_execlists *execlists,
 
 	trace_ports(execlists, msg, execlists->pending);
 
+	if (!execlists->pending[0])
+		return false;
+
 	if (execlists->pending[execlists_num_ports(execlists)])
 		return false;
 
@@ -944,9 +969,21 @@ need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
 static bool
 enable_timeslice(struct intel_engine_cs *engine)
 {
-	struct i915_request *last = last_active(&engine->execlists);
+	struct i915_request * const *port;
+	int hint;
+
+	port = engine->execlists.active;
+	while (port[0] && i915_request_completed(port[0]))
+		port++;
+	if (!port[0])
+		return false;
 
-	return last && need_timeslice(engine, last);
+	hint = engine->execlists.queue_priority_hint;
+	if (port[1])
+		hint = max(rq_prio(port[1]), hint);
+
+	/* Compare the two end-points as an unlocked approximation */
+	return hint >= effective_prio(port[0]);
 }
 
 static void record_preemption(struct intel_engine_execlists *execlists)
@@ -1356,7 +1393,6 @@ static void process_csb(struct intel_engine_cs *engine)
 	const u8 num_entries = execlists->csb_size;
 	u8 head, tail;
 
-	lockdep_assert_held(&engine->active.lock);
 	GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915));
 
 	/*
@@ -1427,15 +1463,14 @@ static void process_csb(struct intel_engine_cs *engine)
 			       execlists->pending,
 			       execlists_num_ports(execlists) *
 			       sizeof(*execlists->pending));
-			execlists->pending[0] = NULL;
-
-			trace_ports(execlists, "promoted", execlists->active);
 
 			if (enable_timeslice(engine))
 				mod_timer(&execlists->timer, jiffies + 1);
 
 			if (!inject_preempt_hang(execlists))
 				ring_set_paused(engine, 0);
+
+			WRITE_ONCE(execlists->pending[0], NULL);
 			break;
 
 		case CSB_COMPLETE: /* port0 completed, advanced to port1 */
@@ -1479,8 +1514,6 @@ static void process_csb(struct intel_engine_cs *engine)
 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
 {
 	lockdep_assert_held(&engine->active.lock);
-
-	process_csb(engine);
 	if (!engine->execlists.pending[0])
 		execlists_dequeue(engine);
 }
@@ -1494,9 +1527,12 @@ static void execlists_submission_tasklet(unsigned long data)
 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
 	unsigned long flags;
 
-	spin_lock_irqsave(&engine->active.lock, flags);
-	__execlists_submission_tasklet(engine);
-	spin_unlock_irqrestore(&engine->active.lock, flags);
+	process_csb(engine);
+	if (!engine->execlists.pending[0]) {
+		spin_lock_irqsave(&engine->active.lock, flags);
+		__execlists_submission_tasklet(engine);
+		spin_unlock_irqrestore(&engine->active.lock, flags);
+	}
 }
 
 static void execlists_submission_timer(struct timer_list *timer)
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index d652ba5d2320..562f756da421 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -161,17 +161,15 @@ __check_struct_size(size_t base, size_t arr, size_t count, size_t *size)
 	((typeof(ptr))((unsigned long)(ptr) | __bits)); \
 })
 
-#define ptr_count_dec(p_ptr) do { \
-	typeof(p_ptr) __p = (p_ptr); \
-	unsigned long __v = (unsigned long)(*__p); \
-	*__p = (typeof(*p_ptr))(--__v); \
-} while (0)
-
-#define ptr_count_inc(p_ptr) do { \
-	typeof(p_ptr) __p = (p_ptr); \
-	unsigned long __v = (unsigned long)(*__p); \
-	*__p = (typeof(*p_ptr))(++__v); \
-} while (0)
+#define ptr_dec(ptr) ({ \
+	unsigned long __v = (unsigned long)(ptr); \
+	(typeof(ptr))(__v - 1); \
+})
+
+#define ptr_inc(ptr) ({ \
+	unsigned long __v = (unsigned long)(ptr); \
+	(typeof(ptr))(__v + 1); \
+})
 
 #define page_mask_bits(ptr) ptr_mask_bits(ptr, PAGE_SHIFT)
 #define page_unmask_bits(ptr) ptr_unmask_bits(ptr, PAGE_SHIFT)
If we only call process_csb() from the tasklet, though we lose the
ability to bypass ksoftirqd interrupt processing on direct submission
paths, we can push it out of the irq-off spinlock.

The penalty is that we then allow schedule_out to be called concurrently
with schedule_in, requiring us to handle the usage count (baked into the
pointer itself) atomically.

As we do kick the tasklets (via local_bh_enable()) after our submission,
there is a possibility there to see if we can pull the local softirq
processing back from the ksoftirqd.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_context_types.h |   4 +-
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |   2 +-
 drivers/gpu/drm/i915/gt/intel_lrc.c           | 130 +++++++++++-------
 drivers/gpu/drm/i915/i915_utils.h             |  20 ++-
 4 files changed, 94 insertions(+), 62 deletions(-)
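
[For reference, the "usage count baked into the pointer itself" relies on
nothing more than pointer alignment leaving the low bits zero. Below is a
minimal, self-contained userspace sketch of the idea behind
ptr_mask_bits()/ptr_unmask_bits() and the new ptr_inc()/ptr_dec(); the
helper names and the demo are illustrative, not the driver's macros. It
also shows the overflow hazard raised in the review: increment past the
spare bits and the pointer itself is silently corrupted.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define COUNT_BITS 2    /* bits spare thanks to allocation alignment */
#define COUNT_MASK (((uintptr_t)1 << COUNT_BITS) - 1)

static void *mask_bits(void *packed)            /* recover the pointer */
{
	return (void *)((uintptr_t)packed & ~COUNT_MASK);
}

static unsigned long unmask_bits(void *packed)  /* recover the count */
{
	return (uintptr_t)packed & COUNT_MASK;
}

int main(void)
{
	/*
	 * malloc() returns memory aligned for any fundamental type, so at
	 * least the low two bits of the address are zero and free to borrow.
	 */
	long *obj = malloc(sizeof(*obj));
	void *inflight = obj;                   /* packed: pointer, count 0 */

	inflight = (char *)inflight + 1;        /* ptr_inc(): count 0 -> 1 */
	inflight = (char *)inflight + 1;        /* ptr_inc(): count 1 -> 2 */
	inflight = (char *)inflight + 1;        /* ptr_inc(): count 2 -> 3 */

	printf("ptr %p, count %lu\n", mask_bits(inflight), unmask_bits(inflight));

	/*
	 * A fourth increment would carry out of COUNT_BITS and silently
	 * advance the pointer -- the hazard behind the GEM_DEBUG_WARN_ON
	 * question. The macros cannot check for this because the number of
	 * spare bits depends on the alignment of the target, which only the
	 * caller knows; hence "use at your own risk".
	 */
	free(obj);
	return 0;
}]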