diff mbox

[05/21] drm/i915: Separate GPU hang waitqueue from advance

Message ID 1464970133-29859-6-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson June 3, 2016, 4:08 p.m. UTC
Currently __i915_wait_request uses a per-engine wait_queue_t for the dual
purpose of waking after the GPU advances or for waking after an error.
In the future, we may add even more wake sources and require greater
separation, but for now we can conceptually simplify wakeups by separating
the two sources. In particular, this allows us to use different wait-queues
(e.g. one on the engine advancement, a global one for errors and one on
each requests) without any hassle.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h |  6 ++++++
 drivers/gpu/drm/i915/i915_gem.c |  5 +++++
 drivers/gpu/drm/i915/i915_irq.c | 19 ++++---------------
 3 files changed, 15 insertions(+), 15 deletions(-)

Comments

Tvrtko Ursulin June 6, 2016, 1 p.m. UTC | #1
On 03/06/16 17:08, Chris Wilson wrote:
> Currently __i915_wait_request uses a per-engine wait_queue_t for the dual
> purpose of waking after the GPU advances or for waking after an error.
> In the future, we may add even more wake sources and require greater
> separation, but for now we can conceptually simplify wakeups by separating
> the two sources. In particular, this allows us to use different wait-queues
> (e.g. one on the engine advancement, a global one for errors and one on
> each requests) without any hassle.

+ Arun

I think this will conflict with the TDR work where one of the features 
is to make reset handling per engine. So I am not sure how beneficial in 
general, or painful for the TDR series, this patch might be.

Regards,

Tvrtko

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.h |  6 ++++++
>   drivers/gpu/drm/i915/i915_gem.c |  5 +++++
>   drivers/gpu/drm/i915/i915_irq.c | 19 ++++---------------
>   3 files changed, 15 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index ceccc6d6b119..e399e97965e0 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1401,6 +1401,12 @@ struct i915_gpu_error {
>   #define I915_WEDGED			(1 << 31)
>
>   	/**
> +	 * Waitqueue to signal when a hang is detected. Used to for waiters
> +	 * to release the struct_mutex for the reset to procede.
> +	 */
> +	wait_queue_head_t wait_queue;
> +
> +	/**
>   	 * Waitqueue to signal when the reset has completed. Used by clients
>   	 * that wait for dev_priv->mm.wedged to settle.
>   	 */
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 03256f096ab6..de4fb39312a4 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1234,6 +1234,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   	const bool irq_test_in_progress =
>   		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_engine_flag(engine);
>   	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
> +	DEFINE_WAIT(reset);
>   	DEFINE_WAIT(wait);
>   	unsigned long timeout_expire;
>   	s64 before = 0; /* Only to silence a compiler warning. */
> @@ -1278,6 +1279,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   		goto out;
>   	}
>
> +	add_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
>   	for (;;) {
>   		struct timer_list timer;
>
> @@ -1329,6 +1331,8 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   			destroy_timer_on_stack(&timer);
>   		}
>   	}
> +	remove_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
> +
>   	if (!irq_test_in_progress)
>   		engine->irq_put(engine);
>
> @@ -5026,6 +5030,7 @@ i915_gem_load_init(struct drm_device *dev)
>   			  i915_gem_retire_work_handler);
>   	INIT_DELAYED_WORK(&dev_priv->mm.idle_work,
>   			  i915_gem_idle_work_handler);
> +	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
>   	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
>
>   	dev_priv->relative_constants_mode = I915_EXEC_CONSTANTS_REL_GENERAL;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 83cab14639b2..30127b94f26e 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2488,11 +2488,8 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>   	return ret;
>   }
>
> -static void i915_error_wake_up(struct drm_i915_private *dev_priv,
> -			       bool reset_completed)
> +static void i915_error_wake_up(struct drm_i915_private *dev_priv)
>   {
> -	struct intel_engine_cs *engine;
> -
>   	/*
>   	 * Notify all waiters for GPU completion events that reset state has
>   	 * been changed, and that they need to restart their wait after
> @@ -2501,18 +2498,10 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
>   	 */
>
>   	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
> -	for_each_engine(engine, dev_priv)
> -		wake_up_all(&engine->irq_queue);
> +	wake_up_all(&dev_priv->gpu_error.wait_queue);
>
>   	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
>   	wake_up_all(&dev_priv->pending_flip_queue);
> -
> -	/*
> -	 * Signal tasks blocked in i915_gem_wait_for_error that the pending
> -	 * reset state is cleared.
> -	 */
> -	if (reset_completed)
> -		wake_up_all(&dev_priv->gpu_error.reset_queue);
>   }
>
>   /**
> @@ -2577,7 +2566,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>   		 * Note: The wake_up also serves as a memory barrier so that
>   		 * waiters see the update value of the reset counter atomic_t.
>   		 */
> -		i915_error_wake_up(dev_priv, true);
> +		wake_up_all(&dev_priv->gpu_error.reset_queue);
>   	}
>   }
>
> @@ -2713,7 +2702,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
>   		 * ensure that the waiters see the updated value of the reset
>   		 * counter atomic_t.
>   		 */
> -		i915_error_wake_up(dev_priv, false);
> +		i915_error_wake_up(dev_priv);
>   	}
>
>   	i915_reset_and_wakeup(dev_priv);
>
arun.siluvery@linux.intel.com June 7, 2016, 12:11 p.m. UTC | #2
On 06/06/2016 18:30, Tvrtko Ursulin wrote:
>
> On 03/06/16 17:08, Chris Wilson wrote:
>> Currently __i915_wait_request uses a per-engine wait_queue_t for the dual
>> purpose of waking after the GPU advances or for waking after an error.
>> In the future, we may add even more wake sources and require greater
>> separation, but for now we can conceptually simplify wakeups by
>> separating
>> the two sources. In particular, this allows us to use different
>> wait-queues
>> (e.g. one on the engine advancement, a global one for errors and one on
>> each requests) without any hassle.
>
> + Arun
>
> I think this will conflict with the TDR work where one of the features
> is to make reset handling per engine. So I am not sure how beneficial in
> general, or painful for the TDR series, this patch might be.

Thanks Tvrtko.
Chris has give some comments on a related tdr patch based on these 
changes. I am looking into how to update my changes based on this.

Tdr code need access to struct_mutex so if a waiter is holding it then 
we should be able to ask it to try again so that we can proceed with 
recovery, similarly when an engine reset is in progress.

regards
Arun

>
> Regards,
>
> Tvrtko
>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h |  6 ++++++
>>   drivers/gpu/drm/i915/i915_gem.c |  5 +++++
>>   drivers/gpu/drm/i915/i915_irq.c | 19 ++++---------------
>>   3 files changed, 15 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index ceccc6d6b119..e399e97965e0 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1401,6 +1401,12 @@ struct i915_gpu_error {
>>   #define I915_WEDGED            (1 << 31)
>>
>>       /**
>> +     * Waitqueue to signal when a hang is detected. Used to for waiters
>> +     * to release the struct_mutex for the reset to procede.
>> +     */
>> +    wait_queue_head_t wait_queue;
>> +
>> +    /**
>>        * Waitqueue to signal when the reset has completed. Used by
>> clients
>>        * that wait for dev_priv->mm.wedged to settle.
>>        */
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index 03256f096ab6..de4fb39312a4 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -1234,6 +1234,7 @@ int __i915_wait_request(struct
>> drm_i915_gem_request *req,
>>       const bool irq_test_in_progress =
>>           ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) &
>> intel_engine_flag(engine);
>>       int state = interruptible ? TASK_INTERRUPTIBLE :
>> TASK_UNINTERRUPTIBLE;
>> +    DEFINE_WAIT(reset);
>>       DEFINE_WAIT(wait);
>>       unsigned long timeout_expire;
>>       s64 before = 0; /* Only to silence a compiler warning. */
>> @@ -1278,6 +1279,7 @@ int __i915_wait_request(struct
>> drm_i915_gem_request *req,
>>           goto out;
>>       }
>>
>> +    add_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
>>       for (;;) {
>>           struct timer_list timer;
>>
>> @@ -1329,6 +1331,8 @@ int __i915_wait_request(struct
>> drm_i915_gem_request *req,
>>               destroy_timer_on_stack(&timer);
>>           }
>>       }
>> +    remove_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
>> +
>>       if (!irq_test_in_progress)
>>           engine->irq_put(engine);
>>
>> @@ -5026,6 +5030,7 @@ i915_gem_load_init(struct drm_device *dev)
>>                 i915_gem_retire_work_handler);
>>       INIT_DELAYED_WORK(&dev_priv->mm.idle_work,
>>                 i915_gem_idle_work_handler);
>> +    init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
>>       init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
>>
>>       dev_priv->relative_constants_mode =
>> I915_EXEC_CONSTANTS_REL_GENERAL;
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c
>> b/drivers/gpu/drm/i915/i915_irq.c
>> index 83cab14639b2..30127b94f26e 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -2488,11 +2488,8 @@ static irqreturn_t gen8_irq_handler(int irq,
>> void *arg)
>>       return ret;
>>   }
>>
>> -static void i915_error_wake_up(struct drm_i915_private *dev_priv,
>> -                   bool reset_completed)
>> +static void i915_error_wake_up(struct drm_i915_private *dev_priv)
>>   {
>> -    struct intel_engine_cs *engine;
>> -
>>       /*
>>        * Notify all waiters for GPU completion events that reset state
>> has
>>        * been changed, and that they need to restart their wait after
>> @@ -2501,18 +2498,10 @@ static void i915_error_wake_up(struct
>> drm_i915_private *dev_priv,
>>        */
>>
>>       /* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
>> -    for_each_engine(engine, dev_priv)
>> -        wake_up_all(&engine->irq_queue);
>> +    wake_up_all(&dev_priv->gpu_error.wait_queue);
>>
>>       /* Wake up intel_crtc_wait_for_pending_flips, holding
>> crtc->mutex. */
>>       wake_up_all(&dev_priv->pending_flip_queue);
>> -
>> -    /*
>> -     * Signal tasks blocked in i915_gem_wait_for_error that the pending
>> -     * reset state is cleared.
>> -     */
>> -    if (reset_completed)
>> -        wake_up_all(&dev_priv->gpu_error.reset_queue);
>>   }
>>
>>   /**
>> @@ -2577,7 +2566,7 @@ static void i915_reset_and_wakeup(struct
>> drm_i915_private *dev_priv)
>>            * Note: The wake_up also serves as a memory barrier so that
>>            * waiters see the update value of the reset counter atomic_t.
>>            */
>> -        i915_error_wake_up(dev_priv, true);
>> +        wake_up_all(&dev_priv->gpu_error.reset_queue);
>>       }
>>   }
>>
>> @@ -2713,7 +2702,7 @@ void i915_handle_error(struct drm_i915_private
>> *dev_priv,
>>            * ensure that the waiters see the updated value of the reset
>>            * counter atomic_t.
>>            */
>> -        i915_error_wake_up(dev_priv, false);
>> +        i915_error_wake_up(dev_priv);
>>       }
>>
>>       i915_reset_and_wakeup(dev_priv);
>>
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ceccc6d6b119..e399e97965e0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1401,6 +1401,12 @@  struct i915_gpu_error {
 #define I915_WEDGED			(1 << 31)
 
 	/**
+	 * Waitqueue to signal when a hang is detected. Used to for waiters
+	 * to release the struct_mutex for the reset to procede.
+	 */
+	wait_queue_head_t wait_queue;
+
+	/**
 	 * Waitqueue to signal when the reset has completed. Used by clients
 	 * that wait for dev_priv->mm.wedged to settle.
 	 */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 03256f096ab6..de4fb39312a4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1234,6 +1234,7 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 	const bool irq_test_in_progress =
 		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_engine_flag(engine);
 	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
+	DEFINE_WAIT(reset);
 	DEFINE_WAIT(wait);
 	unsigned long timeout_expire;
 	s64 before = 0; /* Only to silence a compiler warning. */
@@ -1278,6 +1279,7 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 		goto out;
 	}
 
+	add_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
 	for (;;) {
 		struct timer_list timer;
 
@@ -1329,6 +1331,8 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 			destroy_timer_on_stack(&timer);
 		}
 	}
+	remove_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
+
 	if (!irq_test_in_progress)
 		engine->irq_put(engine);
 
@@ -5026,6 +5030,7 @@  i915_gem_load_init(struct drm_device *dev)
 			  i915_gem_retire_work_handler);
 	INIT_DELAYED_WORK(&dev_priv->mm.idle_work,
 			  i915_gem_idle_work_handler);
+	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
 	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 
 	dev_priv->relative_constants_mode = I915_EXEC_CONSTANTS_REL_GENERAL;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 83cab14639b2..30127b94f26e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2488,11 +2488,8 @@  static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return ret;
 }
 
-static void i915_error_wake_up(struct drm_i915_private *dev_priv,
-			       bool reset_completed)
+static void i915_error_wake_up(struct drm_i915_private *dev_priv)
 {
-	struct intel_engine_cs *engine;
-
 	/*
 	 * Notify all waiters for GPU completion events that reset state has
 	 * been changed, and that they need to restart their wait after
@@ -2501,18 +2498,10 @@  static void i915_error_wake_up(struct drm_i915_private *dev_priv,
 	 */
 
 	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
-	for_each_engine(engine, dev_priv)
-		wake_up_all(&engine->irq_queue);
+	wake_up_all(&dev_priv->gpu_error.wait_queue);
 
 	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
 	wake_up_all(&dev_priv->pending_flip_queue);
-
-	/*
-	 * Signal tasks blocked in i915_gem_wait_for_error that the pending
-	 * reset state is cleared.
-	 */
-	if (reset_completed)
-		wake_up_all(&dev_priv->gpu_error.reset_queue);
 }
 
 /**
@@ -2577,7 +2566,7 @@  static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 		 * Note: The wake_up also serves as a memory barrier so that
 		 * waiters see the update value of the reset counter atomic_t.
 		 */
-		i915_error_wake_up(dev_priv, true);
+		wake_up_all(&dev_priv->gpu_error.reset_queue);
 	}
 }
 
@@ -2713,7 +2702,7 @@  void i915_handle_error(struct drm_i915_private *dev_priv,
 		 * ensure that the waiters see the updated value of the reset
 		 * counter atomic_t.
 		 */
-		i915_error_wake_up(dev_priv, false);
+		i915_error_wake_up(dev_priv);
 	}
 
 	i915_reset_and_wakeup(dev_priv);