Message ID | 20150923144450.GD3383@phenom.ffwll.local (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Sep 23, 2015 at 04:44:50PM +0200, Daniel Vetter wrote: > sorry I sprinkled the locking stuff in the wrong places. Still confused > why the resume side doesn't blow up anywhere But it does: [ 69.394204] BUG: unable to handle kernel NULL pointer dereference at 0000000000000034 [ 69.402080] IP: [<ffffffff81321296>] pci_restore_msi_state+0x196/0x240 [ 69.408624] PGD 4162b8067 PUD 416581067 PMD 0 [ 69.413122] Oops: 0000 [#1] PREEMPT SMP [ 69.417101] Modules linked in: tun sha256_ssse3 sha256_generic drbg binfmt_misc ipv6 vfat fat fuse dm_crypt dm_mod kv m_amd kvm crc32_pclmul aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd amd64_edac_mod edac_mce_amd fa m15h_power k10temp amdkfd amd_iommu_v2 radeon acpi_cpufreq [ 69.443647] CPU: 4 PID: 814 Comm: kworker/u16:5 Not tainted 4.3.0-rc2+ #3 [ 69.450430] Hardware name: To be filled by O.E.M. To be filled by O.E.M./M5A97 EVO R2.0, BIOS 1503 01/16/2013 [ 69.460336] Workqueue: events_unbound async_run_entry_fn [ 69.465667] task: ffff88042a255f00 ti: ffff880428a68000 task.ti: ffff880428a68000 [ 69.473145] RIP: 0010:[<ffffffff81321296>] [<ffffffff81321296>] pci_restore_msi_state+0x196/0x240 [ 69.482131] RSP: 0018:ffff880428a6bc28 EFLAGS: 00010286 [ 69.487436] RAX: 0000000000000000 RBX: ffff88042a308000 RCX: 0000000000000000 [ 69.494568] RDX: 0000000000000001 RSI: ffffffff81304448 RDI: ffffffff816c7a1b [ 69.501700] RBP: ffff880428a6bc40 R08: 0000000000000001 R09: 0000000000522000 [ 69.508833] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 69.515965] R13: ffff88042a3087b0 R14: ffff88042a308010 R15: ffff88042a308038 [ 69.523097] FS: 00007fc91328a700(0000) GS:ffff88042ce00000(0000) knlGS:0000000000000000 [ 69.531185] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 69.536931] CR2: 0000000000000034 CR3: 00000004164c7000 CR4: 00000000000406e0 [ 69.544061] Stack: [ 69.546073] 0080002c2a3087b0 0000000000000000 ffff88042a308000 ffff880428a6bc78 [ 69.553525] ffffffff8130c141 ffff88042a308098 ffff88042a308000 0000000000000000 [ 69.560996] ffff8804284e77a8 ffffffff81961ef1 ffff880428a6bc88 ffffffff8130c2b8 [ 69.568450] Call Trace: [ 69.571044] [<ffffffff8130c141>] pci_restore_state.part.18+0xf1/0x250 [ 69.577706] [<ffffffff8130c2b8>] pci_restore_state+0x18/0x20 [ 69.583591] [<ffffffff8130f7fc>] pci_pm_restore_noirq+0x4c/0xd0 [ 69.589734] [<ffffffff8130f7b0>] ? pci_pm_freeze_noirq+0xf0/0xf0 [ 69.595966] [<ffffffff8146e847>] dpm_run_callback+0x77/0x2a0 [ 69.601850] [<ffffffff8146eb03>] device_resume_noirq+0x93/0x150 [ 69.607994] [<ffffffff8146ebdd>] async_resume_noirq+0x1d/0x50 [ 69.613967] [<ffffffff81078a06>] async_run_entry_fn+0x46/0xf0 [ 69.619939] [<ffffffff8106f548>] process_one_work+0x1f8/0x640 [ 69.625910] [<ffffffff8106f4a4>] ? process_one_work+0x154/0x640 [ 69.632054] [<ffffffff8106f9db>] worker_thread+0x4b/0x440 [ 69.637677] [<ffffffff8106f990>] ? process_one_work+0x640/0x640 [ 69.643822] [<ffffffff81075e86>] kthread+0xf6/0x110 [ 69.648927] [<ffffffff81075d90>] ? kthread_create_on_node+0x1f0/0x1f0 [ 69.655591] [<ffffffff816c893f>] ret_from_fork+0x3f/0x70 [ 69.661128] [<ffffffff81075d90>] ? kthread_create_on_node+0x1f0/0x1f0 [ 69.667794] Code: 66 89 4d ee 0f b7 c9 e8 79 41 fe ff 48 89 df e8 d1 7a ce ff 0f b6 53 4b 8b 73 38 48 8d 4d ee 48 8b 7b 10 83 c2 02 e8 1a 31 fe ff <41> 0f b6 4c 24 34 41 8b 54 24 30 be ff ff ff ff c0 e9 04 83 e1 [ 69.687986] RIP [<ffffffff81321296>] pci_restore_msi_state+0x196/0x240 [ 69.694772] RSP <ffff880428a6bc28> [ 69.698412] CR2: 0000000000000034 [ 69.701879] ---[ end trace 814dd8cc56e427ae ]--- This happens at resume - I caught the output over serial - screen is dead, it doesn't show anything because it simply locks up/panics. > ... Oh well. New patch below. Yep, this one took care of the warning in drm_helper_choose_encoder_dpms(). Thanks! Now I need to go decypher that NULL ptr deref above.
On Wed, Sep 23, 2015 at 06:06:21PM +0200, Borislav Petkov wrote: > On Wed, Sep 23, 2015 at 04:44:50PM +0200, Daniel Vetter wrote: > > sorry I sprinkled the locking stuff in the wrong places. Still confused > > why the resume side doesn't blow up anywhere > > But it does: > > [ 69.394204] BUG: unable to handle kernel NULL pointer dereference at 0000000000000034 > [ 69.402080] IP: [<ffffffff81321296>] pci_restore_msi_state+0x196/0x240 > [ 69.408624] PGD 4162b8067 PUD 416581067 PMD 0 > [ 69.413122] Oops: 0000 [#1] PREEMPT SMP > [ 69.417101] Modules linked in: tun sha256_ssse3 sha256_generic drbg binfmt_misc ipv6 vfat fat fuse dm_crypt dm_mod kv > m_amd kvm crc32_pclmul aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd amd64_edac_mod edac_mce_amd fa > m15h_power k10temp amdkfd amd_iommu_v2 radeon acpi_cpufreq > [ 69.443647] CPU: 4 PID: 814 Comm: kworker/u16:5 Not tainted 4.3.0-rc2+ #3 > [ 69.450430] Hardware name: To be filled by O.E.M. To be filled by O.E.M./M5A97 EVO R2.0, BIOS 1503 01/16/2013 > [ 69.460336] Workqueue: events_unbound async_run_entry_fn > [ 69.465667] task: ffff88042a255f00 ti: ffff880428a68000 task.ti: ffff880428a68000 > [ 69.473145] RIP: 0010:[<ffffffff81321296>] [<ffffffff81321296>] pci_restore_msi_state+0x196/0x240 > [ 69.482131] RSP: 0018:ffff880428a6bc28 EFLAGS: 00010286 > [ 69.487436] RAX: 0000000000000000 RBX: ffff88042a308000 RCX: 0000000000000000 > [ 69.494568] RDX: 0000000000000001 RSI: ffffffff81304448 RDI: ffffffff816c7a1b > [ 69.501700] RBP: ffff880428a6bc40 R08: 0000000000000001 R09: 0000000000522000 > [ 69.508833] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 > [ 69.515965] R13: ffff88042a3087b0 R14: ffff88042a308010 R15: ffff88042a308038 > [ 69.523097] FS: 00007fc91328a700(0000) GS:ffff88042ce00000(0000) knlGS:0000000000000000 > [ 69.531185] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > [ 69.536931] CR2: 0000000000000034 CR3: 00000004164c7000 CR4: 00000000000406e0 > [ 69.544061] Stack: > [ 69.546073] 0080002c2a3087b0 0000000000000000 ffff88042a308000 ffff880428a6bc78 > [ 69.553525] ffffffff8130c141 ffff88042a308098 ffff88042a308000 0000000000000000 > [ 69.560996] ffff8804284e77a8 ffffffff81961ef1 ffff880428a6bc88 ffffffff8130c2b8 > [ 69.568450] Call Trace: > [ 69.571044] [<ffffffff8130c141>] pci_restore_state.part.18+0xf1/0x250 > [ 69.577706] [<ffffffff8130c2b8>] pci_restore_state+0x18/0x20 > [ 69.583591] [<ffffffff8130f7fc>] pci_pm_restore_noirq+0x4c/0xd0 > [ 69.589734] [<ffffffff8130f7b0>] ? pci_pm_freeze_noirq+0xf0/0xf0 > [ 69.595966] [<ffffffff8146e847>] dpm_run_callback+0x77/0x2a0 > [ 69.601850] [<ffffffff8146eb03>] device_resume_noirq+0x93/0x150 > [ 69.607994] [<ffffffff8146ebdd>] async_resume_noirq+0x1d/0x50 > [ 69.613967] [<ffffffff81078a06>] async_run_entry_fn+0x46/0xf0 > [ 69.619939] [<ffffffff8106f548>] process_one_work+0x1f8/0x640 > [ 69.625910] [<ffffffff8106f4a4>] ? process_one_work+0x154/0x640 > [ 69.632054] [<ffffffff8106f9db>] worker_thread+0x4b/0x440 > [ 69.637677] [<ffffffff8106f990>] ? process_one_work+0x640/0x640 > [ 69.643822] [<ffffffff81075e86>] kthread+0xf6/0x110 > [ 69.648927] [<ffffffff81075d90>] ? kthread_create_on_node+0x1f0/0x1f0 > [ 69.655591] [<ffffffff816c893f>] ret_from_fork+0x3f/0x70 > [ 69.661128] [<ffffffff81075d90>] ? kthread_create_on_node+0x1f0/0x1f0 > [ 69.667794] Code: 66 89 4d ee 0f b7 c9 e8 79 41 fe ff 48 89 df e8 d1 7a ce ff 0f b6 53 4b 8b 73 38 48 8d 4d ee 48 8b 7b 10 83 c2 02 e8 1a 31 fe ff <41> 0f b6 4c 24 34 41 8b 54 24 30 be ff ff ff ff c0 e9 04 83 e1 > [ 69.687986] RIP [<ffffffff81321296>] pci_restore_msi_state+0x196/0x240 > [ 69.694772] RSP <ffff880428a6bc28> > [ 69.698412] CR2: 0000000000000034 > [ 69.701879] ---[ end trace 814dd8cc56e427ae ]--- Ok, after some quick staring, we're at __pci_restore_msi_state(): pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control); msi_mask_irq(entry, msi_mask(entry->msi_attrib.multi_cap), entry->masked); which is: .loc 1 411 0 movq %rbx, %rdi # dev, call arch_restore_msi_irqs # .LBB1921: .LBB1922: .loc 2 902 0 movzbl 75(%rbx), %edx # dev_2(D)->msi_cap, D.31945 movl 56(%rbx), %esi # MEM[(const struct pci_dev *)dev_2(D)].devfn, MEM[(const struct pci_dev *)dev_2(D)].devfn leaq -18(%rbp), %rcx #, tmp266 movq 16(%rbx), %rdi # MEM[(const struct pci_dev *)dev_2(D)].bus, MEM[(const struct pci_dev *)dev_2(D)].bus addl $2, %edx #, D.31945 call pci_bus_read_config_word # .LBE1922: .LBE1921: .loc 1 414 0 movzbl 52(%r12), %ecx # *_85, tmp208 <--- faulting insn movl 48(%r12), %edx # _85->D.27233.D.27231.masked, D.31946 .LBB1923: .LBB1924: .loc 1 176 0 movl $-1, %esi #, D.31951 and that %r12 is supposed to contain struct msi_desc *entry in __pci_restore_msi_state(): entry = irq_get_msi_desc(dev->irq); which is .loc 4 654 0 movl 1340(%rdi), %edi # dev_2(D)->irq, dev_2(D)->irq call irq_get_irq_data # .loc 4 655 0 testq %rax, %rax # d je .L405 #, movq 16(%rax), %rax # d_62->common, d_62->common movq 16(%rax), %r12 # _63->msi_desc, D.31954 but as we see above %r12 is 0. For some reason that entry thing in __pci_restore_msi_state() is not checked for NULL even though irq_get_msi_desc() can return NULL. Maybe tglx would have an idea... Hrmmm.
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index d8319dae8358..f3f562f6d848 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1573,10 +1573,12 @@ int radeon_suspend_kms(struct drm_device *dev, bool suspend, bool fbcon) drm_kms_helper_poll_disable(dev); + drm_modeset_lock_all(dev); /* turn off display hw */ list_for_each_entry(connector, &dev->mode_config.connector_list, head) { drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); } + drm_modeset_unlock_all(dev); /* unpin the front buffers and cursors */ list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { @@ -1734,9 +1736,11 @@ int radeon_resume_kms(struct drm_device *dev, bool resume, bool fbcon) if (fbcon) { drm_helper_resume_force_mode(dev); /* turn on display hw */ + drm_modeset_lock_all(dev); list_for_each_entry(connector, &dev->mode_config.connector_list, head) { drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); } + drm_modeset_unlock_all(dev); } drm_kms_helper_poll_enable(dev);