diff mbox series

[42/51] drm/i915/guc: Implement banned contexts for GuC submission

Message ID 20210716201724.54804-43-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series GuC submission support | expand

Commit Message

Matthew Brost July 16, 2021, 8:17 p.m. UTC
When using GuC submission, if a context gets banned disable scheduling
and mark all inflight requests as complete.

Cc: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
 drivers/gpu/drm/i915/gt/intel_context.h       |  13 ++
 drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
 drivers/gpu/drm/i915/gt/intel_reset.c         |  32 +---
 .../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
 drivers/gpu/drm/i915/gt/uc/intel_guc.h        |   2 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 ++++++++++++++++--
 drivers/gpu/drm/i915/i915_trace.h             |  10 ++
 8 files changed, 195 insertions(+), 37 deletions(-)

Comments

John Harrison July 20, 2021, 9:41 p.m. UTC | #1
On 7/16/2021 13:17, Matthew Brost wrote:
> When using GuC submission, if a context gets banned disable scheduling
> and mark all inflight requests as complete.
>
> Cc: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>

> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
>   drivers/gpu/drm/i915/gt/intel_context.h       |  13 ++
>   drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
>   drivers/gpu/drm/i915/gt/intel_reset.c         |  32 +---
>   .../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
>   drivers/gpu/drm/i915/gt/uc/intel_guc.h        |   2 +
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 ++++++++++++++++--
>   drivers/gpu/drm/i915/i915_trace.h             |  10 ++
>   8 files changed, 195 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 28c62f7ccfc7..d87a4c6da5bc 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban)
>   	for_each_gem_engine(ce, engines, it) {
>   		struct intel_engine_cs *engine;
>   
> -		if (ban && intel_context_set_banned(ce))
> +		if (ban && intel_context_ban(ce, NULL))
>   			continue;
>   
>   		/*
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> index 2ed9bf5f91a5..814d9277096a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> @@ -16,6 +16,7 @@
>   #include "intel_engine_types.h"
>   #include "intel_ring_types.h"
>   #include "intel_timeline_types.h"
> +#include "i915_trace.h"
>   
>   #define CE_TRACE(ce, fmt, ...) do {					\
>   	const struct intel_context *ce__ = (ce);			\
> @@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct intel_context *ce)
>   	return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
>   }
>   
> +static inline bool intel_context_ban(struct intel_context *ce,
> +				     struct i915_request *rq)
> +{
> +	bool ret = intel_context_set_banned(ce);
> +
> +	trace_intel_context_ban(ce);
> +	if (ce->ops->ban)
> +		ce->ops->ban(ce, rq);
> +
> +	return ret;
> +}
> +
>   static inline bool
>   intel_context_force_single_submission(const struct intel_context *ce)
>   {
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 035108c10b2c..57c19ee3e313 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -35,6 +35,8 @@ struct intel_context_ops {
>   
>   	int (*alloc)(struct intel_context *ce);
>   
> +	void (*ban)(struct intel_context *ce, struct i915_request *rq);
> +
>   	int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, void **vaddr);
>   	int (*pin)(struct intel_context *ce, void *vaddr);
>   	void (*unpin)(struct intel_context *ce);
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index f3cdbf4ba5c8..3ed694cab5af 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -22,7 +22,6 @@
>   #include "intel_reset.h"
>   
>   #include "uc/intel_guc.h"
> -#include "uc/intel_guc_submission.h"
>   
>   #define RESET_MAX_RETRIES 3
>   
> @@ -39,21 +38,6 @@ static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
>   	intel_uncore_rmw_fw(uncore, reg, clr, 0);
>   }
>   
> -static void skip_context(struct i915_request *rq)
> -{
> -	struct intel_context *hung_ctx = rq->context;
> -
> -	list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) {
> -		if (!i915_request_is_active(rq))
> -			return;
> -
> -		if (rq->context == hung_ctx) {
> -			i915_request_set_error_once(rq, -EIO);
> -			__i915_request_skip(rq);
> -		}
> -	}
> -}
> -
>   static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
>   {
>   	struct drm_i915_file_private *file_priv = ctx->file_priv;
> @@ -88,10 +72,8 @@ static bool mark_guilty(struct i915_request *rq)
>   	bool banned;
>   	int i;
>   
> -	if (intel_context_is_closed(rq->context)) {
> -		intel_context_set_banned(rq->context);
> +	if (intel_context_is_closed(rq->context))
>   		return true;
> -	}
>   
>   	rcu_read_lock();
>   	ctx = rcu_dereference(rq->context->gem_context);
> @@ -123,11 +105,9 @@ static bool mark_guilty(struct i915_request *rq)
>   	banned = !i915_gem_context_is_recoverable(ctx);
>   	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
>   		banned = true;
> -	if (banned) {
> +	if (banned)
>   		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
>   			ctx->name, atomic_read(&ctx->guilty_count));
> -		intel_context_set_banned(rq->context);
> -	}
>   
>   	client_mark_guilty(ctx, banned);
>   
> @@ -149,6 +129,8 @@ static void mark_innocent(struct i915_request *rq)
>   
>   void __i915_request_reset(struct i915_request *rq, bool guilty)
>   {
> +	bool banned = false;
> +
>   	RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));
>   	GEM_BUG_ON(__i915_request_is_complete(rq));
>   
> @@ -156,13 +138,15 @@ void __i915_request_reset(struct i915_request *rq, bool guilty)
>   	if (guilty) {
>   		i915_request_set_error_once(rq, -EIO);
>   		__i915_request_skip(rq);
> -		if (mark_guilty(rq) && !intel_engine_uses_guc(rq->engine))
> -			skip_context(rq);
> +		banned = mark_guilty(rq);
>   	} else {
>   		i915_request_set_error_once(rq, -EAGAIN);
>   		mark_innocent(rq);
>   	}
>   	rcu_read_unlock();
> +
> +	if (banned)
> +		intel_context_ban(rq->context, rq);
>   }
>   
>   static bool i915_in_reset(struct pci_dev *pdev)
> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> index 3b1471c50d40..03939be4297e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> @@ -586,9 +586,29 @@ static void ring_context_reset(struct intel_context *ce)
>   	clear_bit(CONTEXT_VALID_BIT, &ce->flags);
>   }
>   
> +static void ring_context_ban(struct intel_context *ce,
> +			     struct i915_request *rq)
> +{
> +	struct intel_engine_cs *engine;
> +
> +	if (!rq || !i915_request_is_active(rq))
> +		return;
> +
> +	engine = rq->engine;
> +	lockdep_assert_held(&engine->sched_engine->lock);
> +	list_for_each_entry_continue(rq, &engine->sched_engine->requests,
> +				     sched.link)
> +		if (rq->context == ce) {
> +			i915_request_set_error_once(rq, -EIO);
> +			__i915_request_skip(rq);
> +		}
> +}
> +
>   static const struct intel_context_ops ring_context_ops = {
>   	.alloc = ring_context_alloc,
>   
> +	.ban = ring_context_ban,
> +
>   	.pre_pin = ring_context_pre_pin,
>   	.pin = ring_context_pin,
>   	.unpin = ring_context_unpin,
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index dc18ac510ac8..eb6062f95d3b 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -274,6 +274,8 @@ void intel_guc_find_hung_context(struct intel_engine_cs *engine);
>   
>   int intel_guc_global_policies_update(struct intel_guc *guc);
>   
> +void intel_guc_context_ban(struct intel_context *ce, struct i915_request *rq);
> +
>   void intel_guc_submission_reset_prepare(struct intel_guc *guc);
>   void intel_guc_submission_reset(struct intel_guc *guc, bool stalled);
>   void intel_guc_submission_reset_finish(struct intel_guc *guc);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 6536bd6807a0..149990196e3a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -125,6 +125,7 @@ static inline void clr_context_pending_enable(struct intel_context *ce)
>   #define SCHED_STATE_WAIT_FOR_DEREGISTER_TO_REGISTER	BIT(0)
>   #define SCHED_STATE_DESTROYED				BIT(1)
>   #define SCHED_STATE_PENDING_DISABLE			BIT(2)
> +#define SCHED_STATE_BANNED				BIT(3)
>   static inline void init_sched_state(struct intel_context *ce)
>   {
>   	/* Only should be called from guc_lrc_desc_pin() */
> @@ -187,6 +188,23 @@ static inline void clr_context_pending_disable(struct intel_context *ce)
>   		(ce->guc_state.sched_state & ~SCHED_STATE_PENDING_DISABLE);
>   }
>   
> +static inline bool context_banned(struct intel_context *ce)
> +{
> +	return (ce->guc_state.sched_state & SCHED_STATE_BANNED);
> +}
> +
> +static inline void set_context_banned(struct intel_context *ce)
> +{
> +	lockdep_assert_held(&ce->guc_state.lock);
> +	ce->guc_state.sched_state |= SCHED_STATE_BANNED;
> +}
> +
> +static inline void clr_context_banned(struct intel_context *ce)
> +{
> +	lockdep_assert_held(&ce->guc_state.lock);
> +	ce->guc_state.sched_state &= ~SCHED_STATE_BANNED;
> +}
> +
>   static inline bool context_guc_id_invalid(struct intel_context *ce)
>   {
>   	return (ce->guc_id == GUC_INVALID_LRC_ID);
> @@ -356,13 +374,23 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop);
>   
>   static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
>   {
> -	int err;
> +	int err = 0;
>   	struct intel_context *ce = rq->context;
>   	u32 action[3];
>   	int len = 0;
>   	u32 g2h_len_dw = 0;
>   	bool enabled;
>   
> +	/*
> +	 * Corner case where requests were sitting in the priority list or a
> +	 * request resubmitted after the context was banned.
> +	 */
> +	if (unlikely(intel_context_is_banned(ce))) {
> +		i915_request_put(i915_request_mark_eio(rq));
> +		intel_engine_signal_breadcrumbs(ce->engine);
> +		goto out;
> +	}
> +
>   	GEM_BUG_ON(!atomic_read(&ce->guc_id_ref));
>   	GEM_BUG_ON(context_guc_id_invalid(ce));
>   
> @@ -398,6 +426,8 @@ static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
>   		clr_context_pending_enable(ce);
>   		intel_context_put(ce);
>   	}
> +	if (likely(!err))
> +		trace_i915_request_guc_submit(rq);
>   
>   out:
>   	return err;
> @@ -462,7 +492,6 @@ static int guc_dequeue_one_context(struct intel_guc *guc)
>   			guc->stalled_request = last;
>   			return false;
>   		}
> -		trace_i915_request_guc_submit(last);
>   	}
>   
>   	guc->stalled_request = NULL;
> @@ -501,12 +530,13 @@ static void cs_irq_handler(struct intel_engine_cs *engine, u16 iir)
>   static void __guc_context_destroy(struct intel_context *ce);
>   static void release_guc_id(struct intel_guc *guc, struct intel_context *ce);
>   static void guc_signal_context_fence(struct intel_context *ce);
> +static void guc_cancel_context_requests(struct intel_context *ce);
>   
>   static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>   {
>   	struct intel_context *ce;
>   	unsigned long index, flags;
> -	bool pending_disable, pending_enable, deregister, destroyed;
> +	bool pending_disable, pending_enable, deregister, destroyed, banned;
>   
>   	xa_for_each(&guc->context_lookup, index, ce) {
>   		/* Flush context */
> @@ -524,6 +554,7 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>   		pending_enable = context_pending_enable(ce);
>   		pending_disable = context_pending_disable(ce);
>   		deregister = context_wait_for_deregister_to_register(ce);
> +		banned = context_banned(ce);
>   		init_sched_state(ce);
>   
>   		if (pending_enable || destroyed || deregister) {
> @@ -541,6 +572,10 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
>   		/* Not mutualy exclusive with above if statement. */
>   		if (pending_disable) {
>   			guc_signal_context_fence(ce);
> +			if (banned) {
> +				guc_cancel_context_requests(ce);
> +				intel_engine_signal_breadcrumbs(ce->engine);
> +			}
>   			intel_context_sched_disable_unpin(ce);
>   			atomic_dec(&guc->outstanding_submission_g2h);
>   			intel_context_put(ce);
> @@ -659,6 +694,9 @@ static void guc_reset_state(struct intel_context *ce, u32 head, bool scrub)
>   {
>   	struct intel_engine_cs *engine = __context_to_physical_engine(ce);
>   
> +	if (intel_context_is_banned(ce))
> +		return;
> +
>   	GEM_BUG_ON(!intel_context_is_pinned(ce));
>   
>   	/*
> @@ -729,6 +767,8 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled)
>   	struct i915_request *rq;
>   	u32 head;
>   
> +	intel_context_get(ce);
> +
>   	/*
>   	 * GuC will implicitly mark the context as non-schedulable
>   	 * when it sends the reset notification. Make sure our state
> @@ -754,6 +794,7 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled)
>   out_replay:
>   	guc_reset_state(ce, head, stalled);
>   	__unwind_incomplete_requests(ce);
> +	intel_context_put(ce);
>   }
>   
>   void intel_guc_submission_reset(struct intel_guc *guc, bool stalled)
> @@ -936,8 +977,6 @@ static int guc_bypass_tasklet_submit(struct intel_guc *guc,
>   	ret = guc_add_request(guc, rq);
>   	if (ret == -EBUSY)
>   		guc->stalled_request = rq;
> -	else
> -		trace_i915_request_guc_submit(rq);
>   
>   	if (unlikely(ret == -EPIPE))
>   		disable_submission(guc);
> @@ -1329,13 +1368,77 @@ static u16 prep_context_pending_disable(struct intel_context *ce)
>   	return ce->guc_id;
>   }
>   
> +static void __guc_context_set_preemption_timeout(struct intel_guc *guc,
> +						 u16 guc_id,
> +						 u32 preemption_timeout)
> +{
> +	u32 action [] = {
> +		INTEL_GUC_ACTION_SET_CONTEXT_PREEMPTION_TIMEOUT,
> +		guc_id,
> +		preemption_timeout
> +	};
> +
> +	intel_guc_send_busy_loop(guc, action, ARRAY_SIZE(action), 0, true);
> +}
> +
> +static void guc_context_ban(struct intel_context *ce, struct i915_request *rq)
> +{
> +	struct intel_guc *guc = ce_to_guc(ce);
> +	struct intel_runtime_pm *runtime_pm =
> +		&ce->engine->gt->i915->runtime_pm;
> +	intel_wakeref_t wakeref;
> +	unsigned long flags;
> +
> +	guc_flush_submissions(guc);
> +
> +	spin_lock_irqsave(&ce->guc_state.lock, flags);
> +	set_context_banned(ce);
> +
> +	if (submission_disabled(guc) || (!context_enabled(ce) &&
> +	    !context_pending_disable(ce))) {
> +		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> +
> +		guc_cancel_context_requests(ce);
> +		intel_engine_signal_breadcrumbs(ce->engine);
> +	} else if (!context_pending_disable(ce)) {
> +		u16 guc_id;
> +
> +		/*
> +		 * We add +2 here as the schedule disable complete CTB handler
> +		 * calls intel_context_sched_disable_unpin (-2 to pin_count).
> +		 */
> +		atomic_add(2, &ce->pin_count);
> +
> +		guc_id = prep_context_pending_disable(ce);
> +		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> +
> +		/*
> +		 * In addition to disabling scheduling, set the preemption
> +		 * timeout to the minimum value (1 us) so the banned context
> +		 * gets kicked off the HW ASAP.
> +		 */
> +		with_intel_runtime_pm(runtime_pm, wakeref) {
> +			__guc_context_set_preemption_timeout(guc, guc_id, 1);
> +			__guc_context_sched_disable(guc, ce, guc_id);
> +		}
> +	} else {
> +		if (!context_guc_id_invalid(ce))
> +			with_intel_runtime_pm(runtime_pm, wakeref)
> +				__guc_context_set_preemption_timeout(guc,
> +								     ce->guc_id,
> +								     1);
> +		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> +	}
> +}
> +
>   static void guc_context_sched_disable(struct intel_context *ce)
>   {
>   	struct intel_guc *guc = ce_to_guc(ce);
> -	struct intel_runtime_pm *runtime_pm = &ce->engine->gt->i915->runtime_pm;
>   	unsigned long flags;
> -	u16 guc_id;
> +	struct intel_runtime_pm *runtime_pm = &ce->engine->gt->i915->runtime_pm;
>   	intel_wakeref_t wakeref;
> +	u16 guc_id;
> +	bool enabled;
>   
>   	if (submission_disabled(guc) || context_guc_id_invalid(ce) ||
>   	    !lrc_desc_registered(guc, ce->guc_id)) {
> @@ -1349,14 +1452,22 @@ static void guc_context_sched_disable(struct intel_context *ce)
>   	spin_lock_irqsave(&ce->guc_state.lock, flags);
>   
>   	/*
> -	 * We have to check if the context has been pinned again as another pin
> -	 * operation is allowed to pass this function. Checking the pin count,
> -	 * within ce->guc_state.lock, synchronizes this function with
> +	 * We have to check if the context has been disabled by another thread.
> +	 * We also have to check if the context has been pinned again as another
> +	 * pin operation is allowed to pass this function. Checking the pin
> +	 * count, within ce->guc_state.lock, synchronizes this function with
>   	 * guc_request_alloc ensuring a request doesn't slip through the
>   	 * 'context_pending_disable' fence. Checking within the spin lock (can't
>   	 * sleep) ensures another process doesn't pin this context and generate
>   	 * a request before we set the 'context_pending_disable' flag here.
>   	 */
> +	enabled = context_enabled(ce);
> +	if (unlikely(!enabled || submission_disabled(guc))) {
> +		if (enabled)
> +			clr_context_enabled(ce);
> +		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> +		goto unpin;
> +	}
>   	if (unlikely(atomic_add_unless(&ce->pin_count, -2, 2))) {
>   		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
>   		return;
> @@ -1513,6 +1624,8 @@ static const struct intel_context_ops guc_context_ops = {
>   	.unpin = guc_context_unpin,
>   	.post_unpin = guc_context_post_unpin,
>   
> +	.ban = guc_context_ban,
> +
>   	.enter = intel_context_enter_engine,
>   	.exit = intel_context_exit_engine,
>   
> @@ -1706,6 +1819,8 @@ static const struct intel_context_ops virtual_guc_context_ops = {
>   	.unpin = guc_context_unpin,
>   	.post_unpin = guc_context_post_unpin,
>   
> +	.ban = guc_context_ban,
> +
>   	.enter = guc_virtual_context_enter,
>   	.exit = guc_virtual_context_exit,
>   
> @@ -2159,6 +2274,8 @@ int intel_guc_sched_done_process_msg(struct intel_guc *guc,
>   	if (context_pending_enable(ce)) {
>   		clr_context_pending_enable(ce);
>   	} else if (context_pending_disable(ce)) {
> +		bool banned;
> +
>   		/*
>   		 * Unpin must be done before __guc_signal_context_fence,
>   		 * otherwise a race exists between the requests getting
> @@ -2169,9 +2286,16 @@ int intel_guc_sched_done_process_msg(struct intel_guc *guc,
>   		intel_context_sched_disable_unpin(ce);
>   
>   		spin_lock_irqsave(&ce->guc_state.lock, flags);
> +		banned = context_banned(ce);
> +		clr_context_banned(ce);
>   		clr_context_pending_disable(ce);
>   		__guc_signal_context_fence(ce);
>   		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> +
> +		if (banned) {
> +			guc_cancel_context_requests(ce);
> +			intel_engine_signal_breadcrumbs(ce->engine);
> +		}
>   	}
>   
>   	decr_outstanding_submission_g2h(guc);
> @@ -2206,8 +2330,11 @@ static void guc_handle_context_reset(struct intel_guc *guc,
>   				     struct intel_context *ce)
>   {
>   	trace_intel_context_reset(ce);
> -	capture_error_state(guc, ce);
> -	guc_context_replay(ce);
> +
> +	if (likely(!intel_context_is_banned(ce))) {
> +		capture_error_state(guc, ce);
> +		guc_context_replay(ce);
> +	}
>   }
>   
>   int intel_guc_context_reset_process_msg(struct intel_guc *guc,
> diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
> index c095c4d39456..937d3706af9b 100644
> --- a/drivers/gpu/drm/i915/i915_trace.h
> +++ b/drivers/gpu/drm/i915/i915_trace.h
> @@ -934,6 +934,11 @@ DEFINE_EVENT(intel_context, intel_context_reset,
>   	     TP_ARGS(ce)
>   );
>   
> +DEFINE_EVENT(intel_context, intel_context_ban,
> +	     TP_PROTO(struct intel_context *ce),
> +	     TP_ARGS(ce)
> +);
> +
>   DEFINE_EVENT(intel_context, intel_context_register,
>   	     TP_PROTO(struct intel_context *ce),
>   	     TP_ARGS(ce)
> @@ -1036,6 +1041,11 @@ trace_intel_context_reset(struct intel_context *ce)
>   {
>   }
>   
> +static inline void
> +trace_intel_context_ban(struct intel_context *ce)
> +{
> +}
> +
>   static inline void
>   trace_intel_context_register(struct intel_context *ce)
>   {
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 28c62f7ccfc7..d87a4c6da5bc 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1084,7 +1084,7 @@  static void kill_engines(struct i915_gem_engines *engines, bool ban)
 	for_each_gem_engine(ce, engines, it) {
 		struct intel_engine_cs *engine;
 
-		if (ban && intel_context_set_banned(ce))
+		if (ban && intel_context_ban(ce, NULL))
 			continue;
 
 		/*
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index 2ed9bf5f91a5..814d9277096a 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -16,6 +16,7 @@ 
 #include "intel_engine_types.h"
 #include "intel_ring_types.h"
 #include "intel_timeline_types.h"
+#include "i915_trace.h"
 
 #define CE_TRACE(ce, fmt, ...) do {					\
 	const struct intel_context *ce__ = (ce);			\
@@ -243,6 +244,18 @@  static inline bool intel_context_set_banned(struct intel_context *ce)
 	return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
 }
 
+static inline bool intel_context_ban(struct intel_context *ce,
+				     struct i915_request *rq)
+{
+	bool ret = intel_context_set_banned(ce);
+
+	trace_intel_context_ban(ce);
+	if (ce->ops->ban)
+		ce->ops->ban(ce, rq);
+
+	return ret;
+}
+
 static inline bool
 intel_context_force_single_submission(const struct intel_context *ce)
 {
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 035108c10b2c..57c19ee3e313 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -35,6 +35,8 @@  struct intel_context_ops {
 
 	int (*alloc)(struct intel_context *ce);
 
+	void (*ban)(struct intel_context *ce, struct i915_request *rq);
+
 	int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, void **vaddr);
 	int (*pin)(struct intel_context *ce, void *vaddr);
 	void (*unpin)(struct intel_context *ce);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index f3cdbf4ba5c8..3ed694cab5af 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -22,7 +22,6 @@ 
 #include "intel_reset.h"
 
 #include "uc/intel_guc.h"
-#include "uc/intel_guc_submission.h"
 
 #define RESET_MAX_RETRIES 3
 
@@ -39,21 +38,6 @@  static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
 	intel_uncore_rmw_fw(uncore, reg, clr, 0);
 }
 
-static void skip_context(struct i915_request *rq)
-{
-	struct intel_context *hung_ctx = rq->context;
-
-	list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) {
-		if (!i915_request_is_active(rq))
-			return;
-
-		if (rq->context == hung_ctx) {
-			i915_request_set_error_once(rq, -EIO);
-			__i915_request_skip(rq);
-		}
-	}
-}
-
 static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
 {
 	struct drm_i915_file_private *file_priv = ctx->file_priv;
@@ -88,10 +72,8 @@  static bool mark_guilty(struct i915_request *rq)
 	bool banned;
 	int i;
 
-	if (intel_context_is_closed(rq->context)) {
-		intel_context_set_banned(rq->context);
+	if (intel_context_is_closed(rq->context))
 		return true;
-	}
 
 	rcu_read_lock();
 	ctx = rcu_dereference(rq->context->gem_context);
@@ -123,11 +105,9 @@  static bool mark_guilty(struct i915_request *rq)
 	banned = !i915_gem_context_is_recoverable(ctx);
 	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
 		banned = true;
-	if (banned) {
+	if (banned)
 		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
 			ctx->name, atomic_read(&ctx->guilty_count));
-		intel_context_set_banned(rq->context);
-	}
 
 	client_mark_guilty(ctx, banned);
 
@@ -149,6 +129,8 @@  static void mark_innocent(struct i915_request *rq)
 
 void __i915_request_reset(struct i915_request *rq, bool guilty)
 {
+	bool banned = false;
+
 	RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));
 	GEM_BUG_ON(__i915_request_is_complete(rq));
 
@@ -156,13 +138,15 @@  void __i915_request_reset(struct i915_request *rq, bool guilty)
 	if (guilty) {
 		i915_request_set_error_once(rq, -EIO);
 		__i915_request_skip(rq);
-		if (mark_guilty(rq) && !intel_engine_uses_guc(rq->engine))
-			skip_context(rq);
+		banned = mark_guilty(rq);
 	} else {
 		i915_request_set_error_once(rq, -EAGAIN);
 		mark_innocent(rq);
 	}
 	rcu_read_unlock();
+
+	if (banned)
+		intel_context_ban(rq->context, rq);
 }
 
 static bool i915_in_reset(struct pci_dev *pdev)
diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
index 3b1471c50d40..03939be4297e 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
@@ -586,9 +586,29 @@  static void ring_context_reset(struct intel_context *ce)
 	clear_bit(CONTEXT_VALID_BIT, &ce->flags);
 }
 
+static void ring_context_ban(struct intel_context *ce,
+			     struct i915_request *rq)
+{
+	struct intel_engine_cs *engine;
+
+	if (!rq || !i915_request_is_active(rq))
+		return;
+
+	engine = rq->engine;
+	lockdep_assert_held(&engine->sched_engine->lock);
+	list_for_each_entry_continue(rq, &engine->sched_engine->requests,
+				     sched.link)
+		if (rq->context == ce) {
+			i915_request_set_error_once(rq, -EIO);
+			__i915_request_skip(rq);
+		}
+}
+
 static const struct intel_context_ops ring_context_ops = {
 	.alloc = ring_context_alloc,
 
+	.ban = ring_context_ban,
+
 	.pre_pin = ring_context_pre_pin,
 	.pin = ring_context_pin,
 	.unpin = ring_context_unpin,
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index dc18ac510ac8..eb6062f95d3b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -274,6 +274,8 @@  void intel_guc_find_hung_context(struct intel_engine_cs *engine);
 
 int intel_guc_global_policies_update(struct intel_guc *guc);
 
+void intel_guc_context_ban(struct intel_context *ce, struct i915_request *rq);
+
 void intel_guc_submission_reset_prepare(struct intel_guc *guc);
 void intel_guc_submission_reset(struct intel_guc *guc, bool stalled);
 void intel_guc_submission_reset_finish(struct intel_guc *guc);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 6536bd6807a0..149990196e3a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -125,6 +125,7 @@  static inline void clr_context_pending_enable(struct intel_context *ce)
 #define SCHED_STATE_WAIT_FOR_DEREGISTER_TO_REGISTER	BIT(0)
 #define SCHED_STATE_DESTROYED				BIT(1)
 #define SCHED_STATE_PENDING_DISABLE			BIT(2)
+#define SCHED_STATE_BANNED				BIT(3)
 static inline void init_sched_state(struct intel_context *ce)
 {
 	/* Only should be called from guc_lrc_desc_pin() */
@@ -187,6 +188,23 @@  static inline void clr_context_pending_disable(struct intel_context *ce)
 		(ce->guc_state.sched_state & ~SCHED_STATE_PENDING_DISABLE);
 }
 
+static inline bool context_banned(struct intel_context *ce)
+{
+	return (ce->guc_state.sched_state & SCHED_STATE_BANNED);
+}
+
+static inline void set_context_banned(struct intel_context *ce)
+{
+	lockdep_assert_held(&ce->guc_state.lock);
+	ce->guc_state.sched_state |= SCHED_STATE_BANNED;
+}
+
+static inline void clr_context_banned(struct intel_context *ce)
+{
+	lockdep_assert_held(&ce->guc_state.lock);
+	ce->guc_state.sched_state &= ~SCHED_STATE_BANNED;
+}
+
 static inline bool context_guc_id_invalid(struct intel_context *ce)
 {
 	return (ce->guc_id == GUC_INVALID_LRC_ID);
@@ -356,13 +374,23 @@  static int guc_lrc_desc_pin(struct intel_context *ce, bool loop);
 
 static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 {
-	int err;
+	int err = 0;
 	struct intel_context *ce = rq->context;
 	u32 action[3];
 	int len = 0;
 	u32 g2h_len_dw = 0;
 	bool enabled;
 
+	/*
+	 * Corner case where requests were sitting in the priority list or a
+	 * request resubmitted after the context was banned.
+	 */
+	if (unlikely(intel_context_is_banned(ce))) {
+		i915_request_put(i915_request_mark_eio(rq));
+		intel_engine_signal_breadcrumbs(ce->engine);
+		goto out;
+	}
+
 	GEM_BUG_ON(!atomic_read(&ce->guc_id_ref));
 	GEM_BUG_ON(context_guc_id_invalid(ce));
 
@@ -398,6 +426,8 @@  static int guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 		clr_context_pending_enable(ce);
 		intel_context_put(ce);
 	}
+	if (likely(!err))
+		trace_i915_request_guc_submit(rq);
 
 out:
 	return err;
@@ -462,7 +492,6 @@  static int guc_dequeue_one_context(struct intel_guc *guc)
 			guc->stalled_request = last;
 			return false;
 		}
-		trace_i915_request_guc_submit(last);
 	}
 
 	guc->stalled_request = NULL;
@@ -501,12 +530,13 @@  static void cs_irq_handler(struct intel_engine_cs *engine, u16 iir)
 static void __guc_context_destroy(struct intel_context *ce);
 static void release_guc_id(struct intel_guc *guc, struct intel_context *ce);
 static void guc_signal_context_fence(struct intel_context *ce);
+static void guc_cancel_context_requests(struct intel_context *ce);
 
 static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 {
 	struct intel_context *ce;
 	unsigned long index, flags;
-	bool pending_disable, pending_enable, deregister, destroyed;
+	bool pending_disable, pending_enable, deregister, destroyed, banned;
 
 	xa_for_each(&guc->context_lookup, index, ce) {
 		/* Flush context */
@@ -524,6 +554,7 @@  static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 		pending_enable = context_pending_enable(ce);
 		pending_disable = context_pending_disable(ce);
 		deregister = context_wait_for_deregister_to_register(ce);
+		banned = context_banned(ce);
 		init_sched_state(ce);
 
 		if (pending_enable || destroyed || deregister) {
@@ -541,6 +572,10 @@  static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 		/* Not mutualy exclusive with above if statement. */
 		if (pending_disable) {
 			guc_signal_context_fence(ce);
+			if (banned) {
+				guc_cancel_context_requests(ce);
+				intel_engine_signal_breadcrumbs(ce->engine);
+			}
 			intel_context_sched_disable_unpin(ce);
 			atomic_dec(&guc->outstanding_submission_g2h);
 			intel_context_put(ce);
@@ -659,6 +694,9 @@  static void guc_reset_state(struct intel_context *ce, u32 head, bool scrub)
 {
 	struct intel_engine_cs *engine = __context_to_physical_engine(ce);
 
+	if (intel_context_is_banned(ce))
+		return;
+
 	GEM_BUG_ON(!intel_context_is_pinned(ce));
 
 	/*
@@ -729,6 +767,8 @@  static void __guc_reset_context(struct intel_context *ce, bool stalled)
 	struct i915_request *rq;
 	u32 head;
 
+	intel_context_get(ce);
+
 	/*
 	 * GuC will implicitly mark the context as non-schedulable
 	 * when it sends the reset notification. Make sure our state
@@ -754,6 +794,7 @@  static void __guc_reset_context(struct intel_context *ce, bool stalled)
 out_replay:
 	guc_reset_state(ce, head, stalled);
 	__unwind_incomplete_requests(ce);
+	intel_context_put(ce);
 }
 
 void intel_guc_submission_reset(struct intel_guc *guc, bool stalled)
@@ -936,8 +977,6 @@  static int guc_bypass_tasklet_submit(struct intel_guc *guc,
 	ret = guc_add_request(guc, rq);
 	if (ret == -EBUSY)
 		guc->stalled_request = rq;
-	else
-		trace_i915_request_guc_submit(rq);
 
 	if (unlikely(ret == -EPIPE))
 		disable_submission(guc);
@@ -1329,13 +1368,77 @@  static u16 prep_context_pending_disable(struct intel_context *ce)
 	return ce->guc_id;
 }
 
+static void __guc_context_set_preemption_timeout(struct intel_guc *guc,
+						 u16 guc_id,
+						 u32 preemption_timeout)
+{
+	u32 action [] = {
+		INTEL_GUC_ACTION_SET_CONTEXT_PREEMPTION_TIMEOUT,
+		guc_id,
+		preemption_timeout
+	};
+
+	intel_guc_send_busy_loop(guc, action, ARRAY_SIZE(action), 0, true);
+}
+
+static void guc_context_ban(struct intel_context *ce, struct i915_request *rq)
+{
+	struct intel_guc *guc = ce_to_guc(ce);
+	struct intel_runtime_pm *runtime_pm =
+		&ce->engine->gt->i915->runtime_pm;
+	intel_wakeref_t wakeref;
+	unsigned long flags;
+
+	guc_flush_submissions(guc);
+
+	spin_lock_irqsave(&ce->guc_state.lock, flags);
+	set_context_banned(ce);
+
+	if (submission_disabled(guc) || (!context_enabled(ce) &&
+	    !context_pending_disable(ce))) {
+		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
+
+		guc_cancel_context_requests(ce);
+		intel_engine_signal_breadcrumbs(ce->engine);
+	} else if (!context_pending_disable(ce)) {
+		u16 guc_id;
+
+		/*
+		 * We add +2 here as the schedule disable complete CTB handler
+		 * calls intel_context_sched_disable_unpin (-2 to pin_count).
+		 */
+		atomic_add(2, &ce->pin_count);
+
+		guc_id = prep_context_pending_disable(ce);
+		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
+
+		/*
+		 * In addition to disabling scheduling, set the preemption
+		 * timeout to the minimum value (1 us) so the banned context
+		 * gets kicked off the HW ASAP.
+		 */
+		with_intel_runtime_pm(runtime_pm, wakeref) {
+			__guc_context_set_preemption_timeout(guc, guc_id, 1);
+			__guc_context_sched_disable(guc, ce, guc_id);
+		}
+	} else {
+		if (!context_guc_id_invalid(ce))
+			with_intel_runtime_pm(runtime_pm, wakeref)
+				__guc_context_set_preemption_timeout(guc,
+								     ce->guc_id,
+								     1);
+		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
+	}
+}
+
 static void guc_context_sched_disable(struct intel_context *ce)
 {
 	struct intel_guc *guc = ce_to_guc(ce);
-	struct intel_runtime_pm *runtime_pm = &ce->engine->gt->i915->runtime_pm;
 	unsigned long flags;
-	u16 guc_id;
+	struct intel_runtime_pm *runtime_pm = &ce->engine->gt->i915->runtime_pm;
 	intel_wakeref_t wakeref;
+	u16 guc_id;
+	bool enabled;
 
 	if (submission_disabled(guc) || context_guc_id_invalid(ce) ||
 	    !lrc_desc_registered(guc, ce->guc_id)) {
@@ -1349,14 +1452,22 @@  static void guc_context_sched_disable(struct intel_context *ce)
 	spin_lock_irqsave(&ce->guc_state.lock, flags);
 
 	/*
-	 * We have to check if the context has been pinned again as another pin
-	 * operation is allowed to pass this function. Checking the pin count,
-	 * within ce->guc_state.lock, synchronizes this function with
+	 * We have to check if the context has been disabled by another thread.
+	 * We also have to check if the context has been pinned again as another
+	 * pin operation is allowed to pass this function. Checking the pin
+	 * count, within ce->guc_state.lock, synchronizes this function with
 	 * guc_request_alloc ensuring a request doesn't slip through the
 	 * 'context_pending_disable' fence. Checking within the spin lock (can't
 	 * sleep) ensures another process doesn't pin this context and generate
 	 * a request before we set the 'context_pending_disable' flag here.
 	 */
+	enabled = context_enabled(ce);
+	if (unlikely(!enabled || submission_disabled(guc))) {
+		if (enabled)
+			clr_context_enabled(ce);
+		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
+		goto unpin;
+	}
 	if (unlikely(atomic_add_unless(&ce->pin_count, -2, 2))) {
 		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
 		return;
@@ -1513,6 +1624,8 @@  static const struct intel_context_ops guc_context_ops = {
 	.unpin = guc_context_unpin,
 	.post_unpin = guc_context_post_unpin,
 
+	.ban = guc_context_ban,
+
 	.enter = intel_context_enter_engine,
 	.exit = intel_context_exit_engine,
 
@@ -1706,6 +1819,8 @@  static const struct intel_context_ops virtual_guc_context_ops = {
 	.unpin = guc_context_unpin,
 	.post_unpin = guc_context_post_unpin,
 
+	.ban = guc_context_ban,
+
 	.enter = guc_virtual_context_enter,
 	.exit = guc_virtual_context_exit,
 
@@ -2159,6 +2274,8 @@  int intel_guc_sched_done_process_msg(struct intel_guc *guc,
 	if (context_pending_enable(ce)) {
 		clr_context_pending_enable(ce);
 	} else if (context_pending_disable(ce)) {
+		bool banned;
+
 		/*
 		 * Unpin must be done before __guc_signal_context_fence,
 		 * otherwise a race exists between the requests getting
@@ -2169,9 +2286,16 @@  int intel_guc_sched_done_process_msg(struct intel_guc *guc,
 		intel_context_sched_disable_unpin(ce);
 
 		spin_lock_irqsave(&ce->guc_state.lock, flags);
+		banned = context_banned(ce);
+		clr_context_banned(ce);
 		clr_context_pending_disable(ce);
 		__guc_signal_context_fence(ce);
 		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
+
+		if (banned) {
+			guc_cancel_context_requests(ce);
+			intel_engine_signal_breadcrumbs(ce->engine);
+		}
 	}
 
 	decr_outstanding_submission_g2h(guc);
@@ -2206,8 +2330,11 @@  static void guc_handle_context_reset(struct intel_guc *guc,
 				     struct intel_context *ce)
 {
 	trace_intel_context_reset(ce);
-	capture_error_state(guc, ce);
-	guc_context_replay(ce);
+
+	if (likely(!intel_context_is_banned(ce))) {
+		capture_error_state(guc, ce);
+		guc_context_replay(ce);
+	}
 }
 
 int intel_guc_context_reset_process_msg(struct intel_guc *guc,
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index c095c4d39456..937d3706af9b 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -934,6 +934,11 @@  DEFINE_EVENT(intel_context, intel_context_reset,
 	     TP_ARGS(ce)
 );
 
+DEFINE_EVENT(intel_context, intel_context_ban,
+	     TP_PROTO(struct intel_context *ce),
+	     TP_ARGS(ce)
+);
+
 DEFINE_EVENT(intel_context, intel_context_register,
 	     TP_PROTO(struct intel_context *ce),
 	     TP_ARGS(ce)
@@ -1036,6 +1041,11 @@  trace_intel_context_reset(struct intel_context *ce)
 {
 }
 
+static inline void
+trace_intel_context_ban(struct intel_context *ce)
+{
+}
+
 static inline void
 trace_intel_context_register(struct intel_context *ce)
 {