diff mbox series

[V2] drm/i915/gt: Fix a lockdep warning on RT kernel

Message ID 20210414144828.22813-1-jun.miao@windriver.com (mailing list archive)
State New
Headers show
Series [V2] drm/i915/gt: Fix a lockdep warning on RT kernel | expand

Commit Message

Jun Miao April 14, 2021, 2:48 p.m. UTC
Don`t simple disable all the HD-irq, should race the region in the
intel_breadcrumbs_disarm_irq() only.

BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:969
  #0: ffff89c4c00ca970 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x1cf/0x6d0
  #1: ffffa433c1f53e60 ((work_completion)(&engine->retire_work)){+.+.}-{0:0}, at: process_one_work+0x1cf 0x6d
  #2: ffff89c4ccb0a0a8 (kernel_context){+.+.}-{0:0}, at: engine_retire+0x62/0x110 [i915]
  #3: ffff89c4cf682300 (wakeref.mutex#3){+.+.}-{0:0}, at: __intel_wakeref_put_last+0x20/0x60 [i915]
  #4: ffff89c4ccb08398 (&b->irq_lock){+.+.}-{0:0}, at: intel_breadcrumbs_disarm_irq+0x20/0xd0 [i915]
 irq event stamp: 2126
 hardirqs last  enabled at (2125): [<ffffffffbb134739>] cancel_delayed_work+0xa9/0xc0
 hardirqs last disabled at (2126): [<ffffffffc0507fe6>] __intel_breadcrumbs_park+0x76/0x80 [i915]
 softirqs last  enabled at (0): [<ffffffffbb1099ce>] copy_process+0x63e/0x1630
 softirqs last disabled at (0): [<0000000000000000>] 0x0
 CPU: 3 PID: 281 Comm: kworker/3:3 Not tainted 5.10.27-rt34-yocto-preempt-rt #1
 Hardware name: Intel(R) Client Systems NUC7i5DNKE/NUC7i5DNB, BIOS DNKBLi5v.86A.0064.2019.0523.1933 05/23 2019
 Workqueue: events engine_retire [i915]
 Call Trace:
  show_stack+0x52/0x58
  dump_stack+0x7d/0x9f
  ___might_sleep.cold+0xe3/0xf4
  rt_spin_lock+0x3f/0xc0
  ? intel_breadcrumbs_disarm_irq+0x20/0xd0 [i915]
  intel_breadcrumbs_disarm_irq+0x20/0xd0 [i915]
  signal_irq_work+0x241/0x660 [i915]
  ? __this_cpu_preempt_check+0x13/0x20
  ? lockdep_hardirqs_off+0x106/0x120
  __intel_breadcrumbs_park+0x3f/0x80 [i915]
  __engine_park+0xbd/0xe0 [i915]
  ____intel_wakeref_put_last+0x22/0x60 [i915]
  __intel_wakeref_put_last+0x50/0x60 [i915]
  intel_context_exit_engine+0x5f/0x70 [i915]
  i915_request_retire+0x139/0x2d0 [i915]
  engine_retire+0xb0/0x110 [i915]
  process_one_work+0x26d/0x6d0
  worker_thread+0x53/0x330
  kthread+0x1b0/0x1d0
  ? process_one_work+0x6d0/0x6d0
  ? __kthread_parkme+0xc0/0xc0
  ret_from_fork+0x22/0x30

Fixes: 9d5612ca165a ("drm/i915/gt: Defer enabling the breadcrumb interrupt to after submission")
Signed-off-by: Jun Miao <jun.miao@windriver.com>
---
 drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

Comments

Tvrtko Ursulin April 15, 2021, 11:12 a.m. UTC | #1
Hi,

On 14/04/2021 15:48, Jun Miao wrote:
> Don`t simple disable all the HD-irq, should race the region in the
> intel_breadcrumbs_disarm_irq() only.
> 

What is HD-irq, I am, not familiar with that term?

> BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:969
>    #0: ffff89c4c00ca970 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x1cf/0x6d0
>    #1: ffffa433c1f53e60 ((work_completion)(&engine->retire_work)){+.+.}-{0:0}, at: process_one_work+0x1cf 0x6d
>    #2: ffff89c4ccb0a0a8 (kernel_context){+.+.}-{0:0}, at: engine_retire+0x62/0x110 [i915]
>    #3: ffff89c4cf682300 (wakeref.mutex#3){+.+.}-{0:0}, at: __intel_wakeref_put_last+0x20/0x60 [i915]
>    #4: ffff89c4ccb08398 (&b->irq_lock){+.+.}-{0:0}, at: intel_breadcrumbs_disarm_irq+0x20/0xd0 [i915]
>   irq event stamp: 2126
>   hardirqs last  enabled at (2125): [<ffffffffbb134739>] cancel_delayed_work+0xa9/0xc0
>   hardirqs last disabled at (2126): [<ffffffffc0507fe6>] __intel_breadcrumbs_park+0x76/0x80 [i915]
>   softirqs last  enabled at (0): [<ffffffffbb1099ce>] copy_process+0x63e/0x1630
>   softirqs last disabled at (0): [<0000000000000000>] 0x0
>   CPU: 3 PID: 281 Comm: kworker/3:3 Not tainted 5.10.27-rt34-yocto-preempt-rt #1
>   Hardware name: Intel(R) Client Systems NUC7i5DNKE/NUC7i5DNB, BIOS DNKBLi5v.86A.0064.2019.0523.1933 05/23 2019
>   Workqueue: events engine_retire [i915]
>   Call Trace:
>    show_stack+0x52/0x58
>    dump_stack+0x7d/0x9f
>    ___might_sleep.cold+0xe3/0xf4
>    rt_spin_lock+0x3f/0xc0
>    ? intel_breadcrumbs_disarm_irq+0x20/0xd0 [i915]
>    intel_breadcrumbs_disarm_irq+0x20/0xd0 [i915]
>    signal_irq_work+0x241/0x660 [i915]
>    ? __this_cpu_preempt_check+0x13/0x20
>    ? lockdep_hardirqs_off+0x106/0x120
>    __intel_breadcrumbs_park+0x3f/0x80 [i915]
>    __engine_park+0xbd/0xe0 [i915]
>    ____intel_wakeref_put_last+0x22/0x60 [i915]
>    __intel_wakeref_put_last+0x50/0x60 [i915]
>    intel_context_exit_engine+0x5f/0x70 [i915]
>    i915_request_retire+0x139/0x2d0 [i915]
>    engine_retire+0xb0/0x110 [i915]
>    process_one_work+0x26d/0x6d0
>    worker_thread+0x53/0x330
>    kthread+0x1b0/0x1d0
>    ? process_one_work+0x6d0/0x6d0
>    ? __kthread_parkme+0xc0/0xc0
>    ret_from_fork+0x22/0x30
> 
> Fixes: 9d5612ca165a ("drm/i915/gt: Defer enabling the breadcrumb interrupt to after submission")
> Signed-off-by: Jun Miao <jun.miao@windriver.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 8 ++++----
>   1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> index 34a645d..0589b1a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> @@ -103,10 +103,12 @@ static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
>   
>   static void intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
>   {
> -	spin_lock(&b->irq_lock);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&b->irq_lock, flags);
>   	if (b->irq_armed)
>   		__intel_breadcrumbs_disarm_irq(b);
> -	spin_unlock(&b->irq_lock);
> +	spin_unlock_irqrestore(&b->irq_lock, flags);
>   }
>   
>   static void add_signaling_context(struct intel_breadcrumbs *b,
> @@ -337,9 +339,7 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b)
>   	/* Kick the work once more to drain the signalers, and disarm the irq */
>   	irq_work_sync(&b->irq_work);
>   	while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) {
> -		local_irq_disable();
>   		signal_irq_work(&b->irq_work);
> -		local_irq_enable();

Unfortunately there is another lock inside signal_irq_work (rq->lock) 
which needs to be taken irq safe.

RT patches are in tree or out of the tree these days?

Regards,

Tvrtko

>   		cond_resched();
>   	}
>   }
>
Jun Miao April 15, 2021, 12:41 p.m. UTC | #2
On 4/15/21 7:12 PM, Tvrtko Ursulin wrote:
> [Please note: This e-mail is from an EXTERNAL e-mail address]
>
> Hi,
>
> On 14/04/2021 15:48, Jun Miao wrote:
>> Don`t simple disable all the HD-irq, should race the region in the
>> intel_breadcrumbs_disarm_irq() only.
>>
>
> What is HD-irq, I am, not familiar with that term?

Disable local interrupt delivery from Hardware of cpu.:-)

Thanks,

Jun

>
>> BUG: sleeping function called from invalid context at 
>> kernel/locking/rtmutex.c:969
>>    #0: ffff89c4c00ca970 ((wq_completion)events){+.+.}-{0:0}, at: 
>> process_one_work+0x1cf/0x6d0
>>    #1: ffffa433c1f53e60 
>> ((work_completion)(&engine->retire_work)){+.+.}-{0:0}, at: 
>> process_one_work+0x1cf 0x6d
>>    #2: ffff89c4ccb0a0a8 (kernel_context){+.+.}-{0:0}, at: 
>> engine_retire+0x62/0x110 [i915]
>>    #3: ffff89c4cf682300 (wakeref.mutex#3){+.+.}-{0:0}, at: 
>> __intel_wakeref_put_last+0x20/0x60 [i915]
>>    #4: ffff89c4ccb08398 (&b->irq_lock){+.+.}-{0:0}, at: 
>> intel_breadcrumbs_disarm_irq+0x20/0xd0 [i915]
>>   irq event stamp: 2126
>>   hardirqs last  enabled at (2125): [<ffffffffbb134739>] 
>> cancel_delayed_work+0xa9/0xc0
>>   hardirqs last disabled at (2126): [<ffffffffc0507fe6>] 
>> __intel_breadcrumbs_park+0x76/0x80 [i915]
>>   softirqs last  enabled at (0): [<ffffffffbb1099ce>] 
>> copy_process+0x63e/0x1630
>>   softirqs last disabled at (0): [<0000000000000000>] 0x0
>>   CPU: 3 PID: 281 Comm: kworker/3:3 Not tainted 
>> 5.10.27-rt34-yocto-preempt-rt #1
>>   Hardware name: Intel(R) Client Systems NUC7i5DNKE/NUC7i5DNB, BIOS 
>> DNKBLi5v.86A.0064.2019.0523.1933 05/23 2019
>>   Workqueue: events engine_retire [i915]
>>   Call Trace:
>>    show_stack+0x52/0x58
>>    dump_stack+0x7d/0x9f
>>    ___might_sleep.cold+0xe3/0xf4
>>    rt_spin_lock+0x3f/0xc0
>>    ? intel_breadcrumbs_disarm_irq+0x20/0xd0 [i915]
>>    intel_breadcrumbs_disarm_irq+0x20/0xd0 [i915]
>>    signal_irq_work+0x241/0x660 [i915]
>>    ? __this_cpu_preempt_check+0x13/0x20
>>    ? lockdep_hardirqs_off+0x106/0x120
>>    __intel_breadcrumbs_park+0x3f/0x80 [i915]
>>    __engine_park+0xbd/0xe0 [i915]
>>    ____intel_wakeref_put_last+0x22/0x60 [i915]
>>    __intel_wakeref_put_last+0x50/0x60 [i915]
>>    intel_context_exit_engine+0x5f/0x70 [i915]
>>    i915_request_retire+0x139/0x2d0 [i915]
>>    engine_retire+0xb0/0x110 [i915]
>>    process_one_work+0x26d/0x6d0
>>    worker_thread+0x53/0x330
>>    kthread+0x1b0/0x1d0
>>    ? process_one_work+0x6d0/0x6d0
>>    ? __kthread_parkme+0xc0/0xc0
>>    ret_from_fork+0x22/0x30
>>
>> Fixes: 9d5612ca165a ("drm/i915/gt: Defer enabling the breadcrumb 
>> interrupt to after submission")
>> Signed-off-by: Jun Miao <jun.miao@windriver.com>
>> ---
>>   drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 8 ++++----
>>   1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c 
>> b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
>> index 34a645d..0589b1a 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
>> @@ -103,10 +103,12 @@ static void 
>> __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
>>
>>   static void intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
>>   {
>> -     spin_lock(&b->irq_lock);
>> +     unsigned long flags;
>> +
>> +     spin_lock_irqsave(&b->irq_lock, flags);
>>       if (b->irq_armed)
>>               __intel_breadcrumbs_disarm_irq(b);
>> -     spin_unlock(&b->irq_lock);
>> +     spin_unlock_irqrestore(&b->irq_lock, flags);
>>   }
>>
>>   static void add_signaling_context(struct intel_breadcrumbs *b,
>> @@ -337,9 +339,7 @@ void __intel_breadcrumbs_park(struct 
>> intel_breadcrumbs *b)
>>       /* Kick the work once more to drain the signalers, and disarm 
>> the irq */
>>       irq_work_sync(&b->irq_work);
>>       while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) {
>> -             local_irq_disable();
>>               signal_irq_work(&b->irq_work);
>> -             local_irq_enable();
>
> Unfortunately there is another lock inside signal_irq_work (rq->lock)
> which needs to be taken irq safe.
>
Ok, i will change the left spin_lock -> spin_lock_irqsave.

In fact,  inside signal_irq_work,  intel_breadcrumbs_arm_irq 
(&b->irq_lock)  which also needs to be taken irq safe.

Thanks,

Jun

> RT patches are in tree or out of the tree these days?

I base on the mainline kernel tree, and this BUG warning will not 
happen.  But RT v5.10 will complain "BUG warning", so i want this patch 
will solve RT WARNING without affecting mainline performance in mainline 
tree.

Thanks,

Jun


>
> Regards,
>
> Tvrtko
>
>>               cond_resched();
>>       }
>>   }
>>
Tvrtko Ursulin April 19, 2021, 8:57 a.m. UTC | #3
On 15/04/2021 13:41, jun.miao wrote:
> 
> On 4/15/21 7:12 PM, Tvrtko Ursulin wrote:
>> [Please note: This e-mail is from an EXTERNAL e-mail address]
>>
>> Hi,
>>
>> On 14/04/2021 15:48, Jun Miao wrote:
>>> Don`t simple disable all the HD-irq, should race the region in the
>>> intel_breadcrumbs_disarm_irq() only.
>>>
>>
>> What is HD-irq, I am, not familiar with that term?
> 
> Disable local interrupt delivery from Hardware of cpu.:-)

HW then, not HD. ;)

[...]

>>>   static void add_signaling_context(struct intel_breadcrumbs *b,
>>> @@ -337,9 +339,7 @@ void __intel_breadcrumbs_park(struct 
>>> intel_breadcrumbs *b)
>>>       /* Kick the work once more to drain the signalers, and disarm 
>>> the irq */
>>>       irq_work_sync(&b->irq_work);
>>>       while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) {
>>> -             local_irq_disable();
>>>               signal_irq_work(&b->irq_work);
>>> -             local_irq_enable();
>>
>> Unfortunately there is another lock inside signal_irq_work (rq->lock)
>> which needs to be taken irq safe.
>>
> Ok, i will change the left spin_lock -> spin_lock_irqsave.
> 
> In fact,  inside signal_irq_work,  intel_breadcrumbs_arm_irq 
> (&b->irq_lock)  which also needs to be taken irq safe.
> 
> Thanks,
> 
> Jun
> 
>> RT patches are in tree or out of the tree these days?
> 
> I base on the mainline kernel tree, and this BUG warning will not 
> happen.  But RT v5.10 will complain "BUG warning", so i want this patch 
> will solve RT WARNING without affecting mainline performance in mainline 
> tree.

So the problem is we did not typically do changes to cater for out of 
tree stuff, unless they are really minimal. And this one in my view does 
not quite qualify as such.

Regards,

Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index 34a645d..0589b1a 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -103,10 +103,12 @@  static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
 
 static void intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
 {
-	spin_lock(&b->irq_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&b->irq_lock, flags);
 	if (b->irq_armed)
 		__intel_breadcrumbs_disarm_irq(b);
-	spin_unlock(&b->irq_lock);
+	spin_unlock_irqrestore(&b->irq_lock, flags);
 }
 
 static void add_signaling_context(struct intel_breadcrumbs *b,
@@ -337,9 +339,7 @@  void __intel_breadcrumbs_park(struct intel_breadcrumbs *b)
 	/* Kick the work once more to drain the signalers, and disarm the irq */
 	irq_work_sync(&b->irq_work);
 	while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) {
-		local_irq_disable();
 		signal_irq_work(&b->irq_work);
-		local_irq_enable();
 		cond_resched();
 	}
 }