
[1/8] drm/i915/execlists: Lift process_csb() out of the irq-off spinlock

Message ID 20190814092643.1908-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [1/8] drm/i915/execlists: Lift process_csb() out of the irq-off spinlock

Commit Message

Chris Wilson Aug. 14, 2019, 9:26 a.m. UTC
If we only call process_csb() from the tasklet, then although we lose
the ability to bypass ksoftirqd interrupt processing on direct
submission paths, we can push it out of the irq-off spinlock.

The penalty is that we then allow schedule_out to be called concurrently
with schedule_in, requiring us to handle the usage count (baked into the
pointer itself) atomically.

As we do kick the tasklets (via local_bh_enable()) after our submission,
there is an opportunity to see whether we can pull the local softirq
processing back from ksoftirqd.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_context_types.h |   4 +-
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |   2 +-
 drivers/gpu/drm/i915/gt/intel_lrc.c           | 130 +++++++++++-------
 drivers/gpu/drm/i915/i915_utils.h             |  20 ++-
 4 files changed, 94 insertions(+), 62 deletions(-)

Comments

Mika Kuoppala Aug. 16, 2019, 7:50 a.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> If we only call process_csb() from the tasklet, then although we lose
> the ability to bypass ksoftirqd interrupt processing on direct
> submission paths, we can push it out of the irq-off spinlock.
>
> The penalty is that we then allow schedule_out to be called concurrently
> with schedule_in, requiring us to handle the usage count (baked into the
> pointer itself) atomically.
>
> As we do kick the tasklets (via local_bh_enable()) after our submission,
> there is an opportunity to see whether we can pull the local softirq
> processing back from ksoftirqd.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/gt/intel_context_types.h |   4 +-
>  drivers/gpu/drm/i915/gt/intel_engine_cs.c     |   2 +-
>  drivers/gpu/drm/i915/gt/intel_lrc.c           | 130 +++++++++++-------
>  drivers/gpu/drm/i915/i915_utils.h             |  20 ++-
>  4 files changed, 94 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index a632b20ec4d8..d8ce266c049f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -41,9 +41,7 @@ struct intel_context {
>  	struct intel_engine_cs *engine;
>  	struct intel_engine_cs *inflight;
>  #define intel_context_inflight(ce) ptr_mask_bits((ce)->inflight, 2)
> -#define intel_context_inflight_count(ce)  ptr_unmask_bits((ce)->inflight, 2)
> -#define intel_context_inflight_inc(ce) ptr_count_inc(&(ce)->inflight)
> -#define intel_context_inflight_dec(ce) ptr_count_dec(&(ce)->inflight)
> +#define intel_context_inflight_count(ce) ptr_unmask_bits((ce)->inflight, 2)
>  
>  	struct i915_address_space *vm;
>  	struct i915_gem_context *gem_context;
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index d20750e420c0..abd4fde2e52d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -1460,7 +1460,7 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
>  
>  		for (port = execlists->pending; (rq = *port); port++) {
>  			/* Exclude any contexts already counted in active */
> -			if (intel_context_inflight_count(rq->hw_context) == 1)
> +			if (!intel_context_inflight_count(rq->hw_context))
>  				engine->stats.active++;
>  		}
>  
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 5c26c4ae139b..09d8cb8615cf 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -547,27 +547,39 @@ execlists_context_status_change(struct i915_request *rq, unsigned long status)
>  				   status, rq);
>  }
>  
> +static inline struct intel_engine_cs *
> +__execlists_schedule_in(struct i915_request *rq)
> +{
> +	struct intel_engine_cs * const engine = rq->engine;
> +	struct intel_context * const ce = rq->hw_context;
> +
> +	intel_context_get(ce);
> +
> +	intel_gt_pm_get(engine->gt);
> +	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> +	intel_engine_context_in(engine);
> +
> +	return engine;
> +}
> +
>  static inline struct i915_request *
>  execlists_schedule_in(struct i915_request *rq, int idx)
>  {
> -	struct intel_context *ce = rq->hw_context;
> -	int count;
> +	struct intel_context * const ce = rq->hw_context;
> +	struct intel_engine_cs *old;
>  
> +	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
>  	trace_i915_request_in(rq, idx);
>  
> -	count = intel_context_inflight_count(ce);
> -	if (!count) {
> -		intel_context_get(ce);
> -		ce->inflight = rq->engine;
> -
> -		intel_gt_pm_get(ce->inflight->gt);
> -		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> -		intel_engine_context_in(ce->inflight);
> -	}
> +	old = READ_ONCE(ce->inflight);
> +	do {
> +		if (!old) {

The schedule-out might have swapped inflight in here, ruining our day.
So I am trying to figure out how you can pull it off.

> +			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
> +			break;
> +		}
> +	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
>  
> -	intel_context_inflight_inc(ce);
>  	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> -
>  	return i915_request_get(rq);
>  }
>  
> @@ -581,35 +593,45 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
>  }
>  
>  static inline void
> -execlists_schedule_out(struct i915_request *rq)
> +__execlists_schedule_out(struct i915_request *rq,
> +			 struct intel_engine_cs * const engine)
>  {
> -	struct intel_context *ce = rq->hw_context;
> +	struct intel_context * const ce = rq->hw_context;
>  
> -	GEM_BUG_ON(!intel_context_inflight_count(ce));
> +	intel_engine_context_out(engine);
> +	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> +	intel_gt_pm_put(engine->gt);
>  
> -	trace_i915_request_out(rq);
> +	/*
> +	 * If this is part of a virtual engine, its next request may
> +	 * have been blocked waiting for access to the active context.
> +	 * We have to kick all the siblings again in case we need to
> +	 * switch (e.g. the next request is not runnable on this
> +	 * engine). Hopefully, we will already have submitted the next
> +	 * request before the tasklet runs and do not need to rebuild
> +	 * each virtual tree and kick everyone again.
> +	 */
> +	if (ce->engine != engine)
> +		kick_siblings(rq, ce);
>  
> -	intel_context_inflight_dec(ce);
> -	if (!intel_context_inflight_count(ce)) {
> -		intel_engine_context_out(ce->inflight);
> -		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> -		intel_gt_pm_put(ce->inflight->gt);
> +	intel_context_put(ce);
> +}
>  
> -		/*
> -		 * If this is part of a virtual engine, its next request may
> -		 * have been blocked waiting for access to the active context.
> -		 * We have to kick all the siblings again in case we need to
> -		 * switch (e.g. the next request is not runnable on this
> -		 * engine). Hopefully, we will already have submitted the next
> -		 * request before the tasklet runs and do not need to rebuild
> -		 * each virtual tree and kick everyone again.
> -		 */
> -		ce->inflight = NULL;
> -		if (rq->engine != ce->engine)
> -			kick_siblings(rq, ce);
> +static inline void
> +execlists_schedule_out(struct i915_request *rq)
> +{
> +	struct intel_context * const ce = rq->hw_context;
> +	struct intel_engine_cs *cur, *old;
>  
> -		intel_context_put(ce);
> -	}
> +	trace_i915_request_out(rq);
> +	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> +
> +	old = READ_ONCE(ce->inflight);
> +	do
> +		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
> +	while (!try_cmpxchg(&ce->inflight, &old, cur));
> +	if (!cur)
> +		__execlists_schedule_out(rq, old);
>  
>  	i915_request_put(rq);
>  }
> @@ -684,6 +706,9 @@ assert_pending_valid(const struct intel_engine_execlists *execlists,
>  
>  	trace_ports(execlists, msg, execlists->pending);
>  
> +	if (!execlists->pending[0])
> +		return false;
> +
>  	if (execlists->pending[execlists_num_ports(execlists)])
>  		return false;
>  
> @@ -944,9 +969,21 @@ need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
>  static bool
>  enable_timeslice(struct intel_engine_cs *engine)
>  {
> -	struct i915_request *last = last_active(&engine->execlists);
> +	struct i915_request * const *port;
> +	int hint;
> +
> +	port = engine->execlists.active;
> +	while (port[0] && i915_request_completed(port[0]))
> +		port++;
> +	if (!port[0])
> +		return false;
>  
> -	return last && need_timeslice(engine, last);
> +	hint = engine->execlists.queue_priority_hint;
> +	if (port[1])
> +		hint = max(rq_prio(port[1]), hint);
> +
> +	/* Compare the two end-points as an unlocked approximation */
> +	return hint >= effective_prio(port[0]);

What happens if we get it wrong?

>  }
>  
>  static void record_preemption(struct intel_engine_execlists *execlists)
> @@ -1356,7 +1393,6 @@ static void process_csb(struct intel_engine_cs *engine)
>  	const u8 num_entries = execlists->csb_size;
>  	u8 head, tail;
>  
> -	lockdep_assert_held(&engine->active.lock);
>  	GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915));
>  
>  	/*
> @@ -1427,15 +1463,14 @@ static void process_csb(struct intel_engine_cs *engine)
>  				       execlists->pending,
>  				       execlists_num_ports(execlists) *
>  				       sizeof(*execlists->pending));
> -			execlists->pending[0] = NULL;
> -
> -			trace_ports(execlists, "promoted", execlists->active);
>  
>  			if (enable_timeslice(engine))
>  				mod_timer(&execlists->timer, jiffies + 1);
>  
>  			if (!inject_preempt_hang(execlists))
>  				ring_set_paused(engine, 0);
> +
> +			WRITE_ONCE(execlists->pending[0], NULL);
>  			break;
>  
>  		case CSB_COMPLETE: /* port0 completed, advanced to port1 */
> @@ -1479,8 +1514,6 @@ static void process_csb(struct intel_engine_cs *engine)
>  static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
>  {
>  	lockdep_assert_held(&engine->active.lock);
> -
> -	process_csb(engine);
>  	if (!engine->execlists.pending[0])
>  		execlists_dequeue(engine);
>  }
> @@ -1494,9 +1527,12 @@ static void execlists_submission_tasklet(unsigned long data)
>  	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
>  	unsigned long flags;
>  
> -	spin_lock_irqsave(&engine->active.lock, flags);
> -	__execlists_submission_tasklet(engine);
> -	spin_unlock_irqrestore(&engine->active.lock, flags);
> +	process_csb(engine);
> +	if (!engine->execlists.pending[0]) {

!READ_ONCE(...)? It would at least pair with the WRITE_ONCE for documentation purposes.

> +		spin_lock_irqsave(&engine->active.lock, flags);
> +		__execlists_submission_tasklet(engine);
> +		spin_unlock_irqrestore(&engine->active.lock, flags);
> +	}
>  }
>  
>  static void execlists_submission_timer(struct timer_list *timer)
> diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> index d652ba5d2320..562f756da421 100644
> --- a/drivers/gpu/drm/i915/i915_utils.h
> +++ b/drivers/gpu/drm/i915/i915_utils.h
> @@ -161,17 +161,15 @@ __check_struct_size(size_t base, size_t arr, size_t count, size_t *size)
>  	((typeof(ptr))((unsigned long)(ptr) | __bits));			\
>  })
>  
> -#define ptr_count_dec(p_ptr) do {					\
> -	typeof(p_ptr) __p = (p_ptr);					\
> -	unsigned long __v = (unsigned long)(*__p);			\
> -	*__p = (typeof(*p_ptr))(--__v);					\
> -} while (0)
> -
> -#define ptr_count_inc(p_ptr) do {					\
> -	typeof(p_ptr) __p = (p_ptr);					\
> -	unsigned long __v = (unsigned long)(*__p);			\
> -	*__p = (typeof(*p_ptr))(++__v);					\
> -} while (0)
> +#define ptr_dec(ptr) ({							\
> +	unsigned long __v = (unsigned long)(ptr);			\
> +	(typeof(ptr))(__v - 1);						\
> +})
> +
> +#define ptr_inc(ptr) ({							\
> +	unsigned long __v = (unsigned long)(ptr);			\
> +	(typeof(ptr))(__v + 1);						\
> +})

Should we add a GEM_DEBUG_WARN_ON if we overflow, or do we
detect it through other means?
-Mika

>  
>  #define page_mask_bits(ptr) ptr_mask_bits(ptr, PAGE_SHIFT)
>  #define page_unmask_bits(ptr) ptr_unmask_bits(ptr, PAGE_SHIFT)
> -- 
> 2.23.0.rc1
Chris Wilson Aug. 16, 2019, 8:50 a.m. UTC | #2
Quoting Mika Kuoppala (2019-08-16 08:50:29)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >  static inline struct i915_request *
> >  execlists_schedule_in(struct i915_request *rq, int idx)
> >  {
> > -     struct intel_context *ce = rq->hw_context;
> > -     int count;
> > +     struct intel_context * const ce = rq->hw_context;
> > +     struct intel_engine_cs *old;
> >  
> > +     GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
> >       trace_i915_request_in(rq, idx);
> >  
> > -     count = intel_context_inflight_count(ce);
> > -     if (!count) {
> > -             intel_context_get(ce);
> > -             ce->inflight = rq->engine;
> > -
> > -             intel_gt_pm_get(ce->inflight->gt);
> > -             execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> > -             intel_engine_context_in(ce->inflight);
> > -     }
> > +     old = READ_ONCE(ce->inflight);
> > +     do {
> > +             if (!old) {
> 
> The schedule-out might have swapped inflight in here, ruining our day.
> So I am trying to figure out how you can pull it off.

Once we _know_ the context is idle, the only way it can become busy again
is via submitting a request under the engine->active.lock, which we
hold.

Note that schedule-out also has exclusive access by its caller (only one
submission tasklet at a time), but schedule-out may run concurrently with
schedule-in. (But once we idle a context in schedule-out, we will never
see it again until a schedule-in, hence we know that upon seeing NULL we
have exclusive access.)
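
For illustration, here is a minimal userspace sketch of that scheme, using C11
atomics in place of the kernel's try_cmpxchg() and the ptr_inc()/ptr_dec()
helpers (all names below are made up for the sketch and are not the driver's
code):

/*
 * ce->inflight is modelled as a uintptr_t whose low two bits hold a
 * submission count and whose remaining bits hold the engine pointer
 * (which must be at least 4-byte aligned).  Only the first schedule-in
 * (NULL -> engine) and the last schedule-out (engine -> NULL) do the
 * heavyweight work; seeing NULL therefore implies exclusive access.
 */
#include <stdatomic.h>
#include <stdint.h>

#define COUNT_MASK 0x3ul            /* as in ptr_unmask_bits(ptr, 2) */

static _Atomic uintptr_t inflight;  /* stands in for ce->inflight */

static void sketch_schedule_in(uintptr_t engine)
{
        uintptr_t old = atomic_load(&inflight);

        do {
                if (!old) {
                        /* first submission: engine pointer, count bits 0 */
                        atomic_store(&inflight, engine);
                        return;
                }
                /* already in flight: bump the count packed in the low bits */
        } while (!atomic_compare_exchange_weak(&inflight, &old, old + 1));
}

static void sketch_schedule_out(void)
{
        uintptr_t old = atomic_load(&inflight);
        uintptr_t cur;

        do {
                /* count bits 0 => this was the last submission => idle */
                cur = (old & COUNT_MASK) ? old - 1 : 0;
        } while (!atomic_compare_exchange_weak(&inflight, &old, cur));

        /* the real code does __execlists_schedule_out() when cur == 0 */
}

int main(void)
{
        _Alignas(4) unsigned int engine_obj;
        uintptr_t engine = (uintptr_t)&engine_obj;

        sketch_schedule_in(engine);   /* NULL -> engine, one request in flight */
        sketch_schedule_in(engine);   /* embedded count becomes 1 */
        sketch_schedule_out();        /* count back to 0, still in flight */
        sketch_schedule_out();        /* back to NULL, i.e. idle */
        return atomic_load(&inflight) != 0;
}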

> > +                     WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
> > +                     break;
> > +             }
> > +     } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
> >  
> > -     intel_context_inflight_inc(ce);
> >       GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> > -
> >       return i915_request_get(rq);
> >  }

> >  enable_timeslice(struct intel_engine_cs *engine)
> >  {
> > -     struct i915_request *last = last_active(&engine->execlists);
> > +     struct i915_request * const *port;
> > +     int hint;
> > +
> > +     port = engine->execlists.active;
> > +     while (port[0] && i915_request_completed(port[0]))
> > +             port++;
> > +     if (!port[0])
> > +             return false;
> >  
> > -     return last && need_timeslice(engine, last);
> > +     hint = engine->execlists.queue_priority_hint;
> > +     if (port[1])
> > +             hint = max(rq_prio(port[1]), hint);
> > +
> > +     /* Compare the two end-points as an unlocked approximation */
> > +     return hint >= effective_prio(port[0]);
> 
> What happens if we get it wrong?

So the last element in the next context is always the lowest priority
(normally they are all the same priority). If there is a variation in
priority along the requests in the second context, that may mask the fact
that the first request there should trigger a timeslice.

Storing an execlists.switch_priority_hint should remove the ambiguity.
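
One possible shape of that, as a hypothetical sketch only (not part of this
patch; switch_priority_hint is an assumed field that execlists_dequeue()
would record alongside pending[]):

static bool enable_timeslice(struct intel_engine_cs *engine)
{
        struct intel_engine_execlists *el = &engine->execlists;
        const struct i915_request * const *port = el->active;

        /* skip anything that has already completed on the hardware */
        while (*port && i915_request_completed(*port))
                port++;
        if (!*port)
                return false;

        /* compare against the boundary recorded at dequeue, not the queue */
        return READ_ONCE(el->switch_priority_hint) >= effective_prio(*port);
}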

> > @@ -1494,9 +1527,12 @@ static void execlists_submission_tasklet(unsigned long data)
> >       struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> >       unsigned long flags;
> >  
> > -     spin_lock_irqsave(&engine->active.lock, flags);
> > -     __execlists_submission_tasklet(engine);
> > -     spin_unlock_irqrestore(&engine->active.lock, flags);
> > +     process_csb(engine);
> > +     if (!engine->execlists.pending[0]) {
> 
> !READ_ONCE(...)? It would at least pair with the WRITE_ONCE for documentation purposes.

How about if (process_csb()) {
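
A hedged sketch of that suggestion (not in this patch): process_csb() would
return whether pending[] has been consumed, so the tasklet's test pairs with
the WRITE_ONCE() above:

static void execlists_submission_tasklet(unsigned long data)
{
        struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
        unsigned long flags;

        if (process_csb(engine)) { /* true when pending[0] has been cleared */
                spin_lock_irqsave(&engine->active.lock, flags);
                __execlists_submission_tasklet(engine);
                spin_unlock_irqrestore(&engine->active.lock, flags);
        }
}

with process_csb() ending in "return !READ_ONCE(execlists->pending[0]);".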

> > +             spin_lock_irqsave(&engine->active.lock, flags);
> > +             __execlists_submission_tasklet(engine);
> > +             spin_unlock_irqrestore(&engine->active.lock, flags);
> > +     }
> >  }
> >  
> >  static void execlists_submission_timer(struct timer_list *timer)
> > diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> > index d652ba5d2320..562f756da421 100644
> > --- a/drivers/gpu/drm/i915/i915_utils.h
> > +++ b/drivers/gpu/drm/i915/i915_utils.h
> > @@ -161,17 +161,15 @@ __check_struct_size(size_t base, size_t arr, size_t count, size_t *size)
> >       ((typeof(ptr))((unsigned long)(ptr) | __bits));                 \
> >  })
> >  
> > -#define ptr_count_dec(p_ptr) do {                                    \
> > -     typeof(p_ptr) __p = (p_ptr);                                    \
> > -     unsigned long __v = (unsigned long)(*__p);                      \
> > -     *__p = (typeof(*p_ptr))(--__v);                                 \
> > -} while (0)
> > -
> > -#define ptr_count_inc(p_ptr) do {                                    \
> > -     typeof(p_ptr) __p = (p_ptr);                                    \
> > -     unsigned long __v = (unsigned long)(*__p);                      \
> > -     *__p = (typeof(*p_ptr))(++__v);                                 \
> > -} while (0)
> > +#define ptr_dec(ptr) ({                                                      \
> > +     unsigned long __v = (unsigned long)(ptr);                       \
> > +     (typeof(ptr))(__v - 1);                                         \
> > +})
> > +
> > +#define ptr_inc(ptr) ({                                                      \
> > +     unsigned long __v = (unsigned long)(ptr);                       \
> > +     (typeof(ptr))(__v + 1);                                         \
> > +})
> 
> Should we add a GEM_DEBUG_WARN_ON if we overflow, or do we
> detect it through other means?

What's an overflow? What's the alignment of the target pointer? Perhaps
the user intended that every 8 uses starts at the next location. That's
not known at this basic level. And these are decidedly
use-at-your-own-risk :)
-Chris
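
For concreteness, a tiny standalone illustration of the point (not from the
patch): with only two spare low bits, as assumed by intel_context_inflight(),
a fourth increment carries into the pointer bits themselves, and ptr_inc()
by itself cannot know where that boundary lies:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* a 4-byte-aligned object leaves exactly two spare low bits */
        _Alignas(4) uint32_t obj;
        uintptr_t p = (uintptr_t)&obj;

        /* three increments still decode back to the original pointer ... */
        printf("%d\n", (void *)((p + 3) & ~(uintptr_t)3) == (void *)&obj);
        /* ... but a fourth silently corrupts the pointer itself */
        printf("%d\n", (void *)((p + 4) & ~(uintptr_t)3) == (void *)&obj);
        return 0;
}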

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index a632b20ec4d8..d8ce266c049f 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -41,9 +41,7 @@  struct intel_context {
 	struct intel_engine_cs *engine;
 	struct intel_engine_cs *inflight;
 #define intel_context_inflight(ce) ptr_mask_bits((ce)->inflight, 2)
-#define intel_context_inflight_count(ce)  ptr_unmask_bits((ce)->inflight, 2)
-#define intel_context_inflight_inc(ce) ptr_count_inc(&(ce)->inflight)
-#define intel_context_inflight_dec(ce) ptr_count_dec(&(ce)->inflight)
+#define intel_context_inflight_count(ce) ptr_unmask_bits((ce)->inflight, 2)
 
 	struct i915_address_space *vm;
 	struct i915_gem_context *gem_context;
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index d20750e420c0..abd4fde2e52d 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1460,7 +1460,7 @@  int intel_enable_engine_stats(struct intel_engine_cs *engine)
 
 		for (port = execlists->pending; (rq = *port); port++) {
 			/* Exclude any contexts already counted in active */
-			if (intel_context_inflight_count(rq->hw_context) == 1)
+			if (!intel_context_inflight_count(rq->hw_context))
 				engine->stats.active++;
 		}
 
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 5c26c4ae139b..09d8cb8615cf 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -547,27 +547,39 @@  execlists_context_status_change(struct i915_request *rq, unsigned long status)
 				   status, rq);
 }
 
+static inline struct intel_engine_cs *
+__execlists_schedule_in(struct i915_request *rq)
+{
+	struct intel_engine_cs * const engine = rq->engine;
+	struct intel_context * const ce = rq->hw_context;
+
+	intel_context_get(ce);
+
+	intel_gt_pm_get(engine->gt);
+	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
+	intel_engine_context_in(engine);
+
+	return engine;
+}
+
 static inline struct i915_request *
 execlists_schedule_in(struct i915_request *rq, int idx)
 {
-	struct intel_context *ce = rq->hw_context;
-	int count;
+	struct intel_context * const ce = rq->hw_context;
+	struct intel_engine_cs *old;
 
+	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
 	trace_i915_request_in(rq, idx);
 
-	count = intel_context_inflight_count(ce);
-	if (!count) {
-		intel_context_get(ce);
-		ce->inflight = rq->engine;
-
-		intel_gt_pm_get(ce->inflight->gt);
-		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
-		intel_engine_context_in(ce->inflight);
-	}
+	old = READ_ONCE(ce->inflight);
+	do {
+		if (!old) {
+			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
+			break;
+		}
+	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
 
-	intel_context_inflight_inc(ce);
 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
-
 	return i915_request_get(rq);
 }
 
@@ -581,35 +593,45 @@  static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
 }
 
 static inline void
-execlists_schedule_out(struct i915_request *rq)
+__execlists_schedule_out(struct i915_request *rq,
+			 struct intel_engine_cs * const engine)
 {
-	struct intel_context *ce = rq->hw_context;
+	struct intel_context * const ce = rq->hw_context;
 
-	GEM_BUG_ON(!intel_context_inflight_count(ce));
+	intel_engine_context_out(engine);
+	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
+	intel_gt_pm_put(engine->gt);
 
-	trace_i915_request_out(rq);
+	/*
+	 * If this is part of a virtual engine, its next request may
+	 * have been blocked waiting for access to the active context.
+	 * We have to kick all the siblings again in case we need to
+	 * switch (e.g. the next request is not runnable on this
+	 * engine). Hopefully, we will already have submitted the next
+	 * request before the tasklet runs and do not need to rebuild
+	 * each virtual tree and kick everyone again.
+	 */
+	if (ce->engine != engine)
+		kick_siblings(rq, ce);
 
-	intel_context_inflight_dec(ce);
-	if (!intel_context_inflight_count(ce)) {
-		intel_engine_context_out(ce->inflight);
-		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
-		intel_gt_pm_put(ce->inflight->gt);
+	intel_context_put(ce);
+}
 
-		/*
-		 * If this is part of a virtual engine, its next request may
-		 * have been blocked waiting for access to the active context.
-		 * We have to kick all the siblings again in case we need to
-		 * switch (e.g. the next request is not runnable on this
-		 * engine). Hopefully, we will already have submitted the next
-		 * request before the tasklet runs and do not need to rebuild
-		 * each virtual tree and kick everyone again.
-		 */
-		ce->inflight = NULL;
-		if (rq->engine != ce->engine)
-			kick_siblings(rq, ce);
+static inline void
+execlists_schedule_out(struct i915_request *rq)
+{
+	struct intel_context * const ce = rq->hw_context;
+	struct intel_engine_cs *cur, *old;
 
-		intel_context_put(ce);
-	}
+	trace_i915_request_out(rq);
+	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
+
+	old = READ_ONCE(ce->inflight);
+	do
+		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
+	while (!try_cmpxchg(&ce->inflight, &old, cur));
+	if (!cur)
+		__execlists_schedule_out(rq, old);
 
 	i915_request_put(rq);
 }
@@ -684,6 +706,9 @@  assert_pending_valid(const struct intel_engine_execlists *execlists,
 
 	trace_ports(execlists, msg, execlists->pending);
 
+	if (!execlists->pending[0])
+		return false;
+
 	if (execlists->pending[execlists_num_ports(execlists)])
 		return false;
 
@@ -944,9 +969,21 @@  need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
 static bool
 enable_timeslice(struct intel_engine_cs *engine)
 {
-	struct i915_request *last = last_active(&engine->execlists);
+	struct i915_request * const *port;
+	int hint;
+
+	port = engine->execlists.active;
+	while (port[0] && i915_request_completed(port[0]))
+		port++;
+	if (!port[0])
+		return false;
 
-	return last && need_timeslice(engine, last);
+	hint = engine->execlists.queue_priority_hint;
+	if (port[1])
+		hint = max(rq_prio(port[1]), hint);
+
+	/* Compare the two end-points as an unlocked approximation */
+	return hint >= effective_prio(port[0]);
 }
 
 static void record_preemption(struct intel_engine_execlists *execlists)
@@ -1356,7 +1393,6 @@  static void process_csb(struct intel_engine_cs *engine)
 	const u8 num_entries = execlists->csb_size;
 	u8 head, tail;
 
-	lockdep_assert_held(&engine->active.lock);
 	GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915));
 
 	/*
@@ -1427,15 +1463,14 @@  static void process_csb(struct intel_engine_cs *engine)
 				       execlists->pending,
 				       execlists_num_ports(execlists) *
 				       sizeof(*execlists->pending));
-			execlists->pending[0] = NULL;
-
-			trace_ports(execlists, "promoted", execlists->active);
 
 			if (enable_timeslice(engine))
 				mod_timer(&execlists->timer, jiffies + 1);
 
 			if (!inject_preempt_hang(execlists))
 				ring_set_paused(engine, 0);
+
+			WRITE_ONCE(execlists->pending[0], NULL);
 			break;
 
 		case CSB_COMPLETE: /* port0 completed, advanced to port1 */
@@ -1479,8 +1514,6 @@  static void process_csb(struct intel_engine_cs *engine)
 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
 {
 	lockdep_assert_held(&engine->active.lock);
-
-	process_csb(engine);
 	if (!engine->execlists.pending[0])
 		execlists_dequeue(engine);
 }
@@ -1494,9 +1527,12 @@  static void execlists_submission_tasklet(unsigned long data)
 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
 	unsigned long flags;
 
-	spin_lock_irqsave(&engine->active.lock, flags);
-	__execlists_submission_tasklet(engine);
-	spin_unlock_irqrestore(&engine->active.lock, flags);
+	process_csb(engine);
+	if (!engine->execlists.pending[0]) {
+		spin_lock_irqsave(&engine->active.lock, flags);
+		__execlists_submission_tasklet(engine);
+		spin_unlock_irqrestore(&engine->active.lock, flags);
+	}
 }
 
 static void execlists_submission_timer(struct timer_list *timer)
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index d652ba5d2320..562f756da421 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -161,17 +161,15 @@  __check_struct_size(size_t base, size_t arr, size_t count, size_t *size)
 	((typeof(ptr))((unsigned long)(ptr) | __bits));			\
 })
 
-#define ptr_count_dec(p_ptr) do {					\
-	typeof(p_ptr) __p = (p_ptr);					\
-	unsigned long __v = (unsigned long)(*__p);			\
-	*__p = (typeof(*p_ptr))(--__v);					\
-} while (0)
-
-#define ptr_count_inc(p_ptr) do {					\
-	typeof(p_ptr) __p = (p_ptr);					\
-	unsigned long __v = (unsigned long)(*__p);			\
-	*__p = (typeof(*p_ptr))(++__v);					\
-} while (0)
+#define ptr_dec(ptr) ({							\
+	unsigned long __v = (unsigned long)(ptr);			\
+	(typeof(ptr))(__v - 1);						\
+})
+
+#define ptr_inc(ptr) ({							\
+	unsigned long __v = (unsigned long)(ptr);			\
+	(typeof(ptr))(__v + 1);						\
+})
 
 #define page_mask_bits(ptr) ptr_mask_bits(ptr, PAGE_SHIFT)
 #define page_unmask_bits(ptr) ptr_unmask_bits(ptr, PAGE_SHIFT)