
[06/14] drm/i915: Cancel retire_worker on parking

Message ID 20190501114541.10077-6-chris@chris-wilson.co.uk
State New, archived
Series [01/14] drm/i915/hangcheck: Track context changes

Commit Message

Chris Wilson May 1, 2019, 11:45 a.m. UTC
Replace the racy continuation check within retire_work with a definite
kill-switch on idling. The race was being exposed by gem_concurrent_blit
where the retire_worker would be terminated too early leaving us
spinning in debugfs/i915_drop_caches with nothing flushing the
retirement queue.

That the igt is trying to idle from one child while submitting from
another may be a contributing factor as to why it runs so slowly...

Testcase: igt/gem_concurrent_blit
Fixes: 79ffac8599c4 ("drm/i915: Invert the GEM wakeref hierarchy")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_pm.c             | 18 ++++++++++++------
 .../gpu/drm/i915/selftests/mock_gem_device.c   |  1 -
 2 files changed, 12 insertions(+), 7 deletions(-)
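
Distilled from the diff, a minimal sketch of the scheme the patch adopts,
with the i915 specifics (struct_mutex, wakerefs, request retirement)
reduced to placeholders: the retire worker always rearms itself, and the
idle worker acts as the single definite kill-switch. The sketch_* names
below are illustrative, not the i915 code.

#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>

/* Placeholder device; only the pieces relevant to the parking dance. */
struct sketch_device {
	struct workqueue_struct *wq;
	struct delayed_work retire_work;  /* stands in for gem.retire_work */
	struct work_struct idle_work;     /* stands in for gem.idle_work */
	spinlock_t lock;                  /* stands in for the wakeref lock */
	bool active;                      /* stands in for intel_wakeref_active() */
};

static void sketch_retire_handler(struct work_struct *work)
{
	struct sketch_device *sd =
		container_of(work, typeof(*sd), retire_work.work);

	/* ... retire completed requests ... */

	/* Always rearm; only the idle worker may break the cycle. */
	queue_delayed_work(sd->wq, &sd->retire_work,
			   round_jiffies_up_relative(HZ));
}

static void sketch_idle_handler(struct work_struct *work)
{
	struct sketch_device *sd =
		container_of(work, typeof(*sd), idle_work);
	bool restart = true;

	/* The definite kill-switch: stop the self-rearming worker first. */
	cancel_delayed_work_sync(&sd->retire_work);

	spin_lock(&sd->lock);
	if (!sd->active) {
		/* ... park the device ... */
		restart = false;
	}
	spin_unlock(&sd->lock);

	/* Not idle after all: hand the cycle back to the retire worker. */
	if (restart)
		queue_delayed_work(sd->wq, &sd->retire_work,
				   round_jiffies_up_relative(HZ));
}

The synchronous cancel guarantees no retire pass is in flight while the
idle worker deliberates, and the requeue at the end covers the case where
new activity arrived in the meantime.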

Comments

Tvrtko Ursulin May 2, 2019, 1:29 p.m. UTC | #1
On 01/05/2019 12:45, Chris Wilson wrote:
> [...]
> @@ -30,15 +30,23 @@ static void idle_work_handler(struct work_struct *work)
>   {
>   	struct drm_i915_private *i915 =
>   		container_of(work, typeof(*i915), gem.idle_work);
> +	bool restart = true;
>   
> +	cancel_delayed_work_sync(&i915->gem.retire_work);
>   	mutex_lock(&i915->drm.struct_mutex);
>   

You don't want to run another retire here? Since the retire worker might 
have just been canceled I thought you should.

Regards,

Tvrtko

Chris Wilson May 2, 2019, 1:33 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-05-02 14:29:50)
> 
> On 01/05/2019 12:45, Chris Wilson wrote:
> > [...]
> 
> You don't want to run another retire here? Since the retire worker might 
> have just been canceled I thought you should.

Why though? If there are retires outstanding, we won't sleep and want to
defer parking until after the next cycle.
-Chris
Tvrtko Ursulin May 2, 2019, 2:20 p.m. UTC | #3
On 02/05/2019 14:33, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-05-02 14:29:50)
>>
>> On 01/05/2019 12:45, Chris Wilson wrote:
>>> [...]
>>
>> You don't want to run another retire here? Since the retire worker might
>> have just been canceled I thought you should.
> 
> Why though? If there are retires outstanding, we won't sleep and want to
> defer parking until after the next cycle.

In this case what is the point of cancel_delayed_work_*sync* and not 
just the async cancel?

Regards,

Tvrtko
Chris Wilson May 2, 2019, 2:26 p.m. UTC | #4
Quoting Tvrtko Ursulin (2019-05-02 15:20:52)
> 
> On 02/05/2019 14:33, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-05-02 14:29:50)
> >>
> >> On 01/05/2019 12:45, Chris Wilson wrote:
> >>> [...]
> >>
> >> You don't want to run another retire here? Since the retire worker might
> >> have just been canceled I thought you should.
> > 
> > Why though? If there are retires outstanding, we won't sleep and want to
> > defer parking until after the next cycle.
> 
> In this case what is the point of cancel_delayed_work_*sync* and not 
> just the async cancel?

There's a non-sync version? Ah ha!
-Chris
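
For reference, the distinction being reached for here, in terms of the
stock workqueue API: cancel_delayed_work() only removes a pending item
from the queue and may return while an already-running handler is still
executing, whereas cancel_delayed_work_sync() also waits for a running
handler to finish (and defeats self-requeueing while it does so). A
hedged sketch, reusing the illustrative sketch_* names from above:

/*
 * Async cancel: cheap, and safe to call while holding locks the worker
 * also takes, but a sketch_retire_handler() already in flight may still
 * rearm itself after this returns.
 */
static void stop_retire_async(struct sketch_device *sd)
{
	cancel_delayed_work(&sd->retire_work);
}

/*
 * Sync cancel: on return the handler is guaranteed not to be pending or
 * running, so nothing rearms behind our back; it must not be called
 * while holding a lock the handler itself acquires.
 */
static void stop_retire_sync(struct sketch_device *sd)
{
	cancel_delayed_work_sync(&sd->retire_work);
}

Whether the parking decision still holds with only the async cancel is
exactly the question Tvrtko raises above.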

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_pm.c b/drivers/gpu/drm/i915/i915_gem_pm.c
index ae91ad7cb31e..b239b55f84cd 100644
--- a/drivers/gpu/drm/i915/i915_gem_pm.c
+++ b/drivers/gpu/drm/i915/i915_gem_pm.c
@@ -30,15 +30,23 @@ static void idle_work_handler(struct work_struct *work)
 {
 	struct drm_i915_private *i915 =
 		container_of(work, typeof(*i915), gem.idle_work);
+	bool restart = true;
 
+	cancel_delayed_work_sync(&i915->gem.retire_work);
 	mutex_lock(&i915->drm.struct_mutex);
 
 	intel_wakeref_lock(&i915->gt.wakeref);
-	if (!intel_wakeref_active(&i915->gt.wakeref) && !work_pending(work))
+	if (!intel_wakeref_active(&i915->gt.wakeref) && !work_pending(work)) {
 		i915_gem_park(i915);
+		restart = false;
+	}
 	intel_wakeref_unlock(&i915->gt.wakeref);
 
 	mutex_unlock(&i915->drm.struct_mutex);
+	if (restart)
+		queue_delayed_work(i915->wq,
+				   &i915->gem.retire_work,
+				   round_jiffies_up_relative(HZ));
 }
 
 static void retire_work_handler(struct work_struct *work)
@@ -52,10 +60,9 @@ static void retire_work_handler(struct work_struct *work)
 		mutex_unlock(&i915->drm.struct_mutex);
 	}
 
-	if (intel_wakeref_active(&i915->gt.wakeref))
-		queue_delayed_work(i915->wq,
-				   &i915->gem.retire_work,
-				   round_jiffies_up_relative(HZ));
+	queue_delayed_work(i915->wq,
+			   &i915->gem.retire_work,
+			   round_jiffies_up_relative(HZ));
 }
 
 static int pm_notifier(struct notifier_block *nb,
@@ -140,7 +147,6 @@ void i915_gem_suspend(struct drm_i915_private *i915)
 	 * Assert that we successfully flushed all the work and
 	 * reset the GPU back to its idle, low power state.
 	 */
-	drain_delayed_work(&i915->gem.retire_work);
 	GEM_BUG_ON(i915->gt.awake);
 	flush_work(&i915->gem.idle_work);
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index d919f512042c..9fd02025d382 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -58,7 +58,6 @@ static void mock_device_release(struct drm_device *dev)
 	i915_gem_contexts_lost(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 
-	drain_delayed_work(&i915->gem.retire_work);
 	flush_work(&i915->gem.idle_work);
 	i915_gem_drain_workqueue(i915);