diff mbox series

drm/sched: Fix kernel NULL pointer dereference error

Message ID 20220930084810.4639-1-Arvind.Yadav@amd.com (mailing list archive)
State New, archived
Headers show
Series drm/sched: Fix kernel NULL pointer dereference error | expand

Commit Message

Arvind Yadav Sept. 30, 2022, 8:48 a.m. UTC
BUG: kernel NULL pointer dereference, address: 0000000000000088
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP NOPTI
 CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1
 Arvind : [dma_fence_default_wait _START] timeout = -1
 Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018
 RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched]
 Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0
 RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087
 RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018
 RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000
 RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
 R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708
 FS:  0000000000000000(0000) GS:ffff8a48a0a80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0
 Call Trace:
  <IRQ>
  drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
  dma_fence_signal_timestamp_locked+0x7e/0x110
  dma_fence_signal+0x31/0x60
  amdgpu_fence_process+0xc4/0x140 [amdgpu]
  gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu]
  amdgpu_irq_dispatch+0xb7/0x210 [amdgpu]
  amdgpu_ih_process+0x86/0x100 [amdgpu]
  amdgpu_irq_handler+0x24/0x60 [amdgpu]
  __handle_irq_event_percpu+0x4b/0x190
  handle_irq_event_percpu+0x15/0x50
  handle_irq_event+0x39/0x60
  handle_edge_irq+0xaf/0x210
  __common_interrupt+0x6e/0x110
  common_interrupt+0xc1/0xe0
  </IRQ>
  <TASK>

Signed-off-by: Arvind Yadav <Arvind.Yadav@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

Comments

Christian König Sept. 30, 2022, 11:26 a.m. UTC | #1
Am 30.09.22 um 10:48 schrieb Arvind Yadav:
> BUG: kernel NULL pointer dereference, address: 0000000000000088
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
>   PGD 0 P4D 0
>   Oops: 0000 [#1] PREEMPT SMP NOPTI
>   CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1
>   Arvind : [dma_fence_default_wait _START] timeout = -1
>   Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018
>   RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched]
>   Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0
>   RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087
>   RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018
>   RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000
>   RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000
>   R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
>   R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708
>   FS:  0000000000000000(0000) GS:ffff8a48a0a80000(0000) knlGS:0000000000000000
>   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>   CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0
>   Call Trace:
>    <IRQ>
>    drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
>    dma_fence_signal_timestamp_locked+0x7e/0x110
>    dma_fence_signal+0x31/0x60
>    amdgpu_fence_process+0xc4/0x140 [amdgpu]
>    gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu]
>    amdgpu_irq_dispatch+0xb7/0x210 [amdgpu]
>    amdgpu_ih_process+0x86/0x100 [amdgpu]
>    amdgpu_irq_handler+0x24/0x60 [amdgpu]
>    __handle_irq_event_percpu+0x4b/0x190
>    handle_irq_event_percpu+0x15/0x50
>    handle_irq_event+0x39/0x60
>    handle_edge_irq+0xaf/0x210
>    __common_interrupt+0x6e/0x110
>    common_interrupt+0xc1/0xe0
>    </IRQ>
>    <TASK>

How is this triggered and why haven't we seen it before?

Christian

> Signed-off-by: Arvind Yadav <Arvind.Yadav@amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 7 ++++++-
>   1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 6684d88463b4..390272f6b126 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -172,7 +172,12 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
>   static void drm_sched_job_done(struct drm_sched_job *s_job)
>   {
>   	struct drm_sched_fence *s_fence = s_job->s_fence;
> -	struct drm_gpu_scheduler *sched = s_fence->sched;
> +	struct drm_gpu_scheduler *sched;
> +
> +	if (!s_fence)
> +		return;
> +
> +	sched = s_fence->sched;
>   
>   	atomic_dec(&sched->hw_rq_count);
>   	atomic_dec(sched->score);
Yadav, Arvind Sept. 30, 2022, 3:39 p.m. UTC | #2
On 9/30/2022 4:56 PM, Christian König wrote:
> Am 30.09.22 um 10:48 schrieb Arvind Yadav:
>> BUG: kernel NULL pointer dereference, address: 0000000000000088
>>   #PF: supervisor read access in kernel mode
>>   #PF: error_code(0x0000) - not-present page
>>   PGD 0 P4D 0
>>   Oops: 0000 [#1] PREEMPT SMP NOPTI
>>   CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1
>>   Arvind : [dma_fence_default_wait _START] timeout = -1
>>   Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018
>>   RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched]
>>   Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 
>> 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 
>> <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0
>>   RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087
>>   RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018
>>   RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000
>>   RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000
>>   R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
>>   R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708
>>   FS:  0000000000000000(0000) GS:ffff8a48a0a80000(0000) 
>> knlGS:0000000000000000
>>   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>   CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0
>>   Call Trace:
>>    <IRQ>
>>    drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
>>    dma_fence_signal_timestamp_locked+0x7e/0x110
>>    dma_fence_signal+0x31/0x60
>>    amdgpu_fence_process+0xc4/0x140 [amdgpu]
>>    gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu]
>>    amdgpu_irq_dispatch+0xb7/0x210 [amdgpu]
>>    amdgpu_ih_process+0x86/0x100 [amdgpu]
>>    amdgpu_irq_handler+0x24/0x60 [amdgpu]
>>    __handle_irq_event_percpu+0x4b/0x190
>>    handle_irq_event_percpu+0x15/0x50
>>    handle_irq_event+0x39/0x60
>>    handle_edge_irq+0xaf/0x210
>>    __common_interrupt+0x6e/0x110
>>    common_interrupt+0xc1/0xe0
>>    </IRQ>
>>    <TASK>
>
> How is this triggered and why haven't we seen it before?

IGT has a few amdgpu-specific test cases which are not related to fences.

While running those test cases I got this crash, but it is
not always reproducible.

~Arvind

> Christian
>
>> Signed-off-by: Arvind Yadav <Arvind.Yadav@amd.com>
>> ---
>>   drivers/gpu/drm/scheduler/sched_main.c | 7 ++++++-
>>   1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> index 6684d88463b4..390272f6b126 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -172,7 +172,12 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
>>   static void drm_sched_job_done(struct drm_sched_job *s_job)
>>   {
>>       struct drm_sched_fence *s_fence = s_job->s_fence;
>> -    struct drm_gpu_scheduler *sched = s_fence->sched;
>> +    struct drm_gpu_scheduler *sched;
>> +
>> +    if (!s_fence)
>> +        return;
>> +
>> +    sched = s_fence->sched;
>>         atomic_dec(&sched->hw_rq_count);
>>       atomic_dec(sched->score);
>
Michal Kubecek Oct. 8, 2022, 1:48 a.m. UTC | #3
On Fri, Sep 30, 2022 at 09:09:56PM +0530, Yadav, Arvind wrote:
> 
> On 9/30/2022 4:56 PM, Christian König wrote:
> > Am 30.09.22 um 10:48 schrieb Arvind Yadav:
> > > BUG: kernel NULL pointer dereference, address: 0000000000000088
> > >   #PF: supervisor read access in kernel mode
> > >   #PF: error_code(0x0000) - not-present page
> > >   PGD 0 P4D 0
> > >   Oops: 0000 [#1] PREEMPT SMP NOPTI
> > >   CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1
> > >   Arvind : [dma_fence_default_wait _START] timeout = -1
> > >   Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018
> > >   RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched]
> > >   Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66
> > > 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53
> > > <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0
> > >   RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087
> > >   RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018
> > >   RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000
> > >   RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000
> > >   R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> > >   R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708
> > >   FS:  0000000000000000(0000) GS:ffff8a48a0a80000(0000)
> > > knlGS:0000000000000000
> > >   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > >   CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0
> > >   Call Trace:
> > >    <IRQ>
> > >    drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
> > >    dma_fence_signal_timestamp_locked+0x7e/0x110
> > >    dma_fence_signal+0x31/0x60
> > >    amdgpu_fence_process+0xc4/0x140 [amdgpu]
> > >    gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu]
> > >    amdgpu_irq_dispatch+0xb7/0x210 [amdgpu]
> > >    amdgpu_ih_process+0x86/0x100 [amdgpu]
> > >    amdgpu_irq_handler+0x24/0x60 [amdgpu]
> > >    __handle_irq_event_percpu+0x4b/0x190
> > >    handle_irq_event_percpu+0x15/0x50
> > >    handle_irq_event+0x39/0x60
> > >    handle_edge_irq+0xaf/0x210
> > >    __common_interrupt+0x6e/0x110
> > >    common_interrupt+0xc1/0xe0
> > >    </IRQ>
> > >    <TASK>
> > 
> > How is this triggered and why haven't we seen it before?
> 
> IGT has a few amdgpu-specific test cases which are not related to fences.
> 
> While running those test cases I got this crash, but it is not
> always reproducible.

I've encountered this crash twice (out of two attempts) with a kernel
built from mainline commit 4c86114194e6 when I started Firefox.

The patch below fixed the crash and does not seem to introduce any
apparent issue.

Tested-by: Michal Kubecek <mkubecek@suse.cz>

> > > Signed-off-by: Arvind Yadav <Arvind.Yadav@amd.com>
> > > ---
> > >   drivers/gpu/drm/scheduler/sched_main.c | 7 ++++++-
> > >   1 file changed, 6 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c
> > > b/drivers/gpu/drm/scheduler/sched_main.c
> > > index 6684d88463b4..390272f6b126 100644
> > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > @@ -172,7 +172,12 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
> > >   static void drm_sched_job_done(struct drm_sched_job *s_job)
> > >   {
> > >       struct drm_sched_fence *s_fence = s_job->s_fence;
> > > -    struct drm_gpu_scheduler *sched = s_fence->sched;
> > > +    struct drm_gpu_scheduler *sched;
> > > +
> > > +    if (!s_fence)
> > > +        return;
> > > +
> > > +    sched = s_fence->sched;
> > >         atomic_dec(&sched->hw_rq_count);
> > >       atomic_dec(sched->score);
> >
diff mbox series

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 6684d88463b4..390272f6b126 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -172,7 +172,12 @@  drm_sched_rq_select_entity(struct drm_sched_rq *rq)
 static void drm_sched_job_done(struct drm_sched_job *s_job)
 {
 	struct drm_sched_fence *s_fence = s_job->s_fence;
-	struct drm_gpu_scheduler *sched = s_fence->sched;
+	struct drm_gpu_scheduler *sched;
+
+	if (!s_fence)
+		return;
+
+	sched = s_fence->sched;
 
 	atomic_dec(&sched->hw_rq_count);
 	atomic_dec(sched->score);