diff mbox series

patch 46/47 causes NULL pointer deref on mt7921

Message ID 20240711175156.4465-1-spasswolf@web.de (mailing list archive)
State Not Applicable
Delegated to: Felix Fietkau
Headers show
Series patch 46/47 causes NULL pointer deref on mt7921 | expand

Commit Message

Bert Karwatzki July 11, 2024, 5:51 p.m. UTC
Since linux-next-20240711 my linux system fails to start with a NULL pointer
error. Hardware: MSI Alpha 15 Laptop Ryzen 5800H with
04:00.0 Network controller: MEDIATEK Corp. MT7921K (RZ608) Wi-Fi 6E 80MHz

[  T843] BUG: unable to handle page fault for address: ffffffffffffffa0
[  T843] #PF: supervisor read access in kernel mode
[  T843] #PF: error_code(0x0000) - not-present page
[  T843] PGD e5c81a067 P4D e5c81a067 PUD e5c81c067 PMD 0 
[  T843] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[  T843] CPU: 2 UID: 0 PID: 843 Comm: NetworkManager Not tainted 6.10.0-rc7-next-20240711-dirty #9
[  T843] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.107 11/10/2021
[  T843] RIP: 0010:mt792x_mac_link_bss_remove+0x24/0x110 [mt792x_lib]
[  T843] Code: 84 00 00 00 00 00 f3 0f 1e fa 41 56 41 55 41 54 49 89 f4 55 48 89 fd 53 48 8b 46 18 48 89 d3 44 0f b7 aa b8 00 00 00 8b 56 60 <66> 83 78 a0 00 74 0f 83 fa 0e 77 0a 4c 8b b4 d0 28 ff ff ff eb 07
[  T843] RSP: 0018:ffffbc1b43b3b688 EFLAGS: 00010282
[  T843] RAX: 0000000000000000 RBX: ffff906ab80a9f00 RCX: 000fffffffe00000
[  T843] RDX: 0000000000000000 RSI: ffff906ab80a9e20 RDI: ffff9069c1712000
[  T843] RBP: ffff9069c1712000 R08: ffff9069c0402018 R09: ffffffffab226720
[  T843] R10: 0000000000000000 R11: 0000000000000000 R12: ffff906ab80a9e20
[  T843] R13: 0000000000000013 R14: 0000000000000000 R15: ffff906ab80a9990
[  T843] FS:  00007fb2edd7b500(0000) GS:ffff90786e680000(0000) knlGS:0000000000000000
[  T843] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  T843] CR2: ffffffffffffffa0 CR3: 000000010a104000 CR4: 0000000000750ef0
[  T843] PKRU: 55555554
[  T843] Call Trace:
[  T843]  <TASK>
[  T843]  ? __die+0x1e/0x60
[  T843]  ? page_fault_oops+0x157/0x450
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? search_bpf_extables+0x5a/0x80
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? exc_page_fault+0x2bb/0x670
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? lock_timer_base+0x71/0x90
[  T843]  ? asm_exc_page_fault+0x26/0x30
[  T843]  ? mt792x_mac_link_bss_remove+0x24/0x110 [mt792x_lib]
[  T843]  ? mt792x_remove_interface+0x6e/0x90 [mt792x_lib]
[  T843]  ? ieee80211_do_stop+0x507/0x7e0 [mac80211]
[  T843]  ? ieee80211_stop+0x53/0x190 [mac80211]
[  T843]  ? __dev_close_many+0xa5/0x120
[  T843]  ? __dev_change_flags+0x18c/0x220
[  T843]  ? dev_change_flags+0x21/0x60
[  T843]  ? do_setlink+0xdf9/0x11d0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? security_sock_rcv_skb+0x33/0x50
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? __nla_validate_parse+0x61/0xd10
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? genl_done+0x53/0x80
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? netlink_dump+0x357/0x410
[  T843]  ? __rtnl_newlink+0x5d6/0x980
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? genl_family_rcv_msg_dumpit+0xdf/0xf0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? __kmalloc_cache_noprof+0x44/0x210
[  T843]  ? rtnl_newlink+0x42/0x60
[  T843]  ? rtnetlink_rcv_msg+0x152/0x3f0
[  T843]  ? mptcp_pm_nl_dump_addr+0x180/0x180
[  T843]  ? rtnl_calcit.isra.0+0x130/0x130
[  T843]  ? netlink_rcv_skb+0x56/0x100
[  T843]  ? netlink_unicast+0x199/0x290
[  T843]  ? netlink_sendmsg+0x21d/0x490
[  T843]  ? __sock_sendmsg+0x78/0x80
[  T843]  ? ____sys_sendmsg+0x23f/0x2e0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? copy_msghdr_from_user+0x68/0xa0
[  T843]  ? ___sys_sendmsg+0x81/0xd0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? crng_fast_key_erasure+0xbc/0xf0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? get_random_bytes_user+0x126/0x140
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? __fdget+0xb1/0xe0
[  T843]  ? __sys_sendmsg+0x56/0xa0
[  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T843]  ? do_syscall_64+0x5f/0x170
[  T843]  ? entry_SYSCALL_64_after_hwframe+0x55/0x5d
[  T843]  </TASK>
[  T843] Modules linked in: bnep nls_ascii nls_cp437 vfat fat snd_ctl_led snd_hda_codec_realtek snd_hda_codec_generic snd_hda_scodec_component snd_hda_codec_hdmi btusb btrtl btintel snd_hda_intel uvcvideo btbcm snd_intel_dspcfg btmtk snd_hda_codec snd_soc_dmic snd_acp3x_pdm_dma snd_acp3x_rn videobuf2_vmalloc snd_hwdep uvc bluetooth videobuf2_memops snd_soc_core snd_hda_core videobuf2_v4l2 snd_pcm_oss snd_mixer_oss videodev snd_pcm snd_rn_pci_acp3x videobuf2_common snd_acp_config snd_timer msi_wmi snd_soc_acpi ecdh_generic amd_atl mc ecc sparse_keymap edac_mce_amd wmi_bmof snd ccp k10temp snd_pci_acp3x soundcore ac battery button hid_sensor_prox hid_sensor_gyro_3d hid_sensor_als hid_sensor_accel_3d hid_sensor_magn_3d hid_sensor_trigger industrialio_triggered_buffer joydev kfifo_buf industrialio amd_pmc hid_sensor_iio_common evdev hid_multitouch serio_raw mt7921e mt7921_common mt792x_lib mt76_connac_lib mt76 mac80211 libarc4 cfg80211 rfkill msr fuse nvme_fabrics configfs efi_pstore efivarfs autofs4 ext4
[  T843]  crc32c_generic mbcache jbd2 usbhid amdgpu i2c_algo_bit drm_ttm_helper xhci_pci ttm drm_exec drm_suballoc_helper xhci_hcd amdxcp drm_buddy hid_sensor_hub usbcore nvme gpu_sched mfd_core hid_generic crc32c_intel psmouse drm_display_helper amd_sfh i2c_piix4 usb_common nvme_core crc16 r8169 i2c_hid_acpi i2c_hid hid i2c_designware_platform i2c_designware_core
[  T843] CR2: ffffffffffffffa0
[  T843] ---[ end trace 0000000000000000 ]---

I bisected the error between linux-6.10-rc7 and linux-next-20240711 and the
first offending commit which showed a NULL pointer error was
https://lore.kernel.org/all/20240613030241.5771-47-sean.wang@kernel.org/
but the error is actually a different but most likely related NULL pointer error.
To debug this I added some printk to the offending commit:


With these I get the following error message on startup:

[  T858] mt792x_remove_interface 157: hw = ffff92dc11560900 vif = ffff92dbe072d970 mvif = ffff92dbe072de00 dev = ffff92dc11562000
[  T858] mt792x_link_conf_to_mconf 263: vif = ffff92dbe072d970 mvif = ffff92dbe072de00
[  T858] mt792x_vif_to_link 233: vif = ffff92dbe072d970
[  T858] mt792x_mac_link_bss_remove 122
[  T858] mt792x_mac_link_bss_remove 125
[  T858] mt792x_mac_link_bss_remove 127
[  T858] mt792x_mac_link_bss_remove 129
[  T858] mt792x_mac_link_bss_remove 132
[  T858] mt792x_mac_link_bss_remove 135: mconf->vif = 0000000000000000
[  T858] BUG: kernel NULL pointer dereference, address: 00000000000004b8
[  T858] #PF: supervisor read access in kernel mode
[  T858] #PF: error_code(0x0000) - not-present page
[  T858] PGD 0 P4D 0 
[  T858] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[  T858] CPU: 0 PID: 858 Comm: NetworkManager Not tainted 6.10.0-rc5-debug-01238-g1541d63c5fe2-dirty #30
[  T858] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.107 11/10/2021
[  T858] RIP: 0010:mt792x_remove_interface+0x1df/0x2e0 [mt792x_lib]
[  T858] Code: 85 f0 30 00 00 49 8b 4f 18 e8 5d 4f f0 f7 49 8b 47 18 41 0f b6 4f 01 ba 89 00 00 00 48 c7 c6 90 24 05 c1 48 c7 c7 36 33 05 c1 <48> 8b 80 b8 04 00 00 49 d3 e4 49 f7 d4 4c 21 a0 10 27 00 00 4c 8d
[  T858] RSP: 0018:ffff9fcf03db7698 EFLAGS: 00010246
[  T858] RAX: 0000000000000000 RBX: ffff92dbe072d970 RCX: 0000000000000000
[  T858] RDX: 0000000000000089 RSI: ffffffffc1052490 RDI: ffffffffc1053336
[  T858] RBP: ffff92dc11562000 R08: 0000000000000000 R09: 0000000000000003
[  T858] R10: ffff9fcf03db7550 R11: ffffffffba099d28 R12: 0000000000000001
[  T858] R13: ffff92dbe072ded8 R14: ffff92dc1156a150 R15: ffff92dbe072de00
[  T858] FS:  00007fa13c515500(0000) GS:ffff92ea6e600000(0000) knlGS:0000000000000000
[  T858] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  T858] CR2: 00000000000004b8 CR3: 00000001046a2000 CR4: 0000000000750ef0
[  T858] PKRU: 55555554
[  T858] Call Trace:
[  T858]  <TASK>
[  T858]  ? __die+0x1e/0x60
[  T858]  ? page_fault_oops+0x157/0x450
[  T858]  ? _prb_read_valid+0x273/0x2e0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? exc_page_fault+0x331/0x670
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? prb_read_valid+0x16/0x20
[  T858]  ? asm_exc_page_fault+0x26/0x30
[  T858]  ? mt792x_remove_interface+0x1df/0x2e0 [mt792x_lib]
[  T858]  ? mt792x_remove_interface+0x1c3/0x2e0 [mt792x_lib]
[  T858]  ? ieee80211_do_stop+0x507/0x7e0 [mac80211]
[  T858]  ? ieee80211_stop+0x53/0x190 [mac80211]
[  T858]  ? __dev_close_many+0xa5/0x120
[  T858]  ? __dev_change_flags+0x18c/0x220
[  T858]  ? dev_change_flags+0x21/0x60
[  T858]  ? do_setlink+0xdf9/0x11d0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? __kmalloc_large_node+0x7e/0xb0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? security_sock_rcv_skb+0x33/0x50
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? __nla_validate_parse+0x61/0xd10
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? genl_done+0x53/0x80
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? netlink_dump+0x357/0x410
[  T858]  ? __rtnl_newlink+0x5d1/0x980
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? genl_family_rcv_msg_dumpit+0xdf/0xf0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? kmalloc_trace_noprof+0x44/0x210
[  T858]  ? rtnl_newlink+0x42/0x60
[  T858]  ? rtnetlink_rcv_msg+0x14d/0x3f0
[  T858]  ? mptcp_pm_nl_dump_addr+0x180/0x180
[  T858]  ? rtnl_calcit.isra.0+0x130/0x130
[  T858]  ? netlink_rcv_skb+0x56/0x100
[  T858]  ? netlink_unicast+0x199/0x290
[  T858]  ? netlink_sendmsg+0x21d/0x490
[  T858]  ? __sock_sendmsg+0x78/0x80
[  T858]  ? ____sys_sendmsg+0x23f/0x2e0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? copy_msghdr_from_user+0x68/0xa0
[  T858]  ? ___sys_sendmsg+0x81/0xd0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? crng_fast_key_erasure+0xbc/0xf0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? get_random_bytes_user+0x126/0x140
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? __fdget+0xb1/0xe0
[  T858]  ? __sys_sendmsg+0x56/0xa0
[  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T858]  ? do_syscall_64+0x5f/0x170
[  T858]  ? entry_SYSCALL_64_after_hwframe+0x55/0x5d
[  T858]  </TASK>
[  T858] Modules linked in: cmac bnep nls_ascii nls_cp437 vfat fat snd_ctl_led snd_hda_codec_realtek snd_hda_codec_generic snd_hda_scodec_component btusb snd_hda_codec_hdmi btrtl btintel btbcm btmtk snd_hda_intel amd_atl snd_intel_dspcfg bluetooth snd_acp3x_pdm_dma snd_soc_dmic snd_acp3x_rn snd_hda_codec uvcvideo snd_soc_core videobuf2_vmalloc uvc snd_hwdep videobuf2_memops videobuf2_v4l2 snd_hda_core videodev snd_pcm_oss snd_mixer_oss snd_pcm snd_rn_pci_acp3x snd_acp_config videobuf2_common snd_timer msi_wmi snd_soc_acpi ecdh_generic ecc mc sparse_keymap edac_mce_amd snd wmi_bmof k10temp ccp snd_pci_acp3x soundcore battery ac button joydev hid_sensor_magn_3d hid_sensor_gyro_3d hid_sensor_als hid_sensor_accel_3d hid_sensor_prox hid_sensor_trigger industrialio_triggered_buffer kfifo_buf industrialio amd_pmc hid_sensor_iio_common evdev hid_multitouch serio_raw mt7921e mt7921_common mt792x_lib mt76_connac_lib mt76 mac80211 libarc4 cfg80211 rfkill msr fuse nvme_fabrics efi_pstore configfs efivarfs autofs4 ext4
[  T858]  crc32c_generic crc16 mbcache jbd2 usbhid amdgpu i2c_algo_bit xhci_pci drm_ttm_helper ttm xhci_hcd drm_exec drm_suballoc_helper amdxcp nvme drm_buddy hid_sensor_hub usbcore gpu_sched nvme_core mfd_core hid_generic crc32c_intel psmouse amd_sfh i2c_piix4 usb_common t10_pi drm_display_helper r8169 i2c_hid_acpi i2c_hid hid i2c_designware_platform i2c_designware_core
[  T858] CR2: 00000000000004b8
[  T858] ---[ end trace 0000000000000000 ]---

So the problem is here that mconf->vif is still NULL probably because
on mt7921 nobody is bothering to set it.

I did a similar investigation for the error in linux-next-20240711

void mt792x_mac_link_bss_remove(struct mt792x_dev *dev,
				struct mt792x_bss_conf *mconf,
				struct mt792x_link_sta *mlink)
{
	struct ieee80211_vif *vif = container_of((void *)mconf->vif,
						 struct ieee80211_vif, drv_priv);
	struct ieee80211_bss_conf *link_conf;
	int idx = mlink->wcid.idx;

	printk(KERN_INFO "%s %d: dev = %px mconf = %px mlink = %px vif = %px\n",
			__func__, __LINE__, dev, mconf, mlink, vif);
	link_conf = mt792x_vif_to_bss_conf(vif, mconf->link_id);

This leads to the following message on startup
 
[  T848] mt792x_mac_link_bss_remove 147: dev = ffff9403c1672000 mconf = ffff9403c1a35e20 mlink = ffff9403c1a35f00 vif = fffffffffffffb70
[  T848] BUG: unable to handle page fault for address: ffffffffffffffa0
[  T848] #PF: supervisor read access in kernel mode
[  T848] #PF: error_code(0x0000) - not-present page
[skipped backtrace]

showing that vif is an invalid (though not NULL) pointer here, too.

Bert Karwatzki

Comments

Bert Karwatzki July 11, 2024, 6:05 p.m. UTC | #1
The mt7925_ops struct contains these four ieee80211_ops which
are not present in the mt7921_ops struct. In mt7925_change_vif_links
mconf->vif is set.

from drivers/net/wireless/mediatek/mt76/mt7925/main.c:
	.vif_cfg_changed = mt7925_vif_cfg_changed,
	.link_info_changed = mt7925_link_info_changed,
	.change_vif_links = mt7925_change_vif_links,
	.change_sta_links = mt7925_change_sta_links,

So I think you're probably breaking generic code (i. e. code that is also used
on mt7921) with mt7925 specific changes.

Bert Karwatzki
Sean Wang July 11, 2024, 11:40 p.m. UTC | #2
Hi Bert,

Thanks for the detailed debug log. I've quickly made a change to fix
the issue. Right now, I can't access the test environment, but I'll
test it and send it out as soon as possible. Here's the patch.

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 2e6268cb06c0..1bab93d049df 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
struct ieee80211_vif *vif)

        mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
        mvif->phy = phy;
+       mvif->bss_conf.vif = mvif;
        mvif->bss_conf.mt76.band_idx = 0;
        mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
MT76_CONNAC_MAX_WMM_SETS;

On Thu, Jul 11, 2024 at 12:52 PM Bert Karwatzki <spasswolf@web.de> wrote:
>
> Since linux-next-20240711 my linux system fails to start with a NULL pointer
> error. Hardware: MSI Alpha 15 Laptop Ryzen 5800H with
> 04:00.0 Network controller: MEDIATEK Corp. MT7921K (RZ608) Wi-Fi 6E 80MHz
>
> [  T843] BUG: unable to handle page fault for address: ffffffffffffffa0
> [  T843] #PF: supervisor read access in kernel mode
> [  T843] #PF: error_code(0x0000) - not-present page
> [  T843] PGD e5c81a067 P4D e5c81a067 PUD e5c81c067 PMD 0
> [  T843] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
> [  T843] CPU: 2 UID: 0 PID: 843 Comm: NetworkManager Not tainted 6.10.0-rc7-next-20240711-dirty #9
> [  T843] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.107 11/10/2021
> [  T843] RIP: 0010:mt792x_mac_link_bss_remove+0x24/0x110 [mt792x_lib]
> [  T843] Code: 84 00 00 00 00 00 f3 0f 1e fa 41 56 41 55 41 54 49 89 f4 55 48 89 fd 53 48 8b 46 18 48 89 d3 44 0f b7 aa b8 00 00 00 8b 56 60 <66> 83 78 a0 00 74 0f 83 fa 0e 77 0a 4c 8b b4 d0 28 ff ff ff eb 07
> [  T843] RSP: 0018:ffffbc1b43b3b688 EFLAGS: 00010282
> [  T843] RAX: 0000000000000000 RBX: ffff906ab80a9f00 RCX: 000fffffffe00000
> [  T843] RDX: 0000000000000000 RSI: ffff906ab80a9e20 RDI: ffff9069c1712000
> [  T843] RBP: ffff9069c1712000 R08: ffff9069c0402018 R09: ffffffffab226720
> [  T843] R10: 0000000000000000 R11: 0000000000000000 R12: ffff906ab80a9e20
> [  T843] R13: 0000000000000013 R14: 0000000000000000 R15: ffff906ab80a9990
> [  T843] FS:  00007fb2edd7b500(0000) GS:ffff90786e680000(0000) knlGS:0000000000000000
> [  T843] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  T843] CR2: ffffffffffffffa0 CR3: 000000010a104000 CR4: 0000000000750ef0
> [  T843] PKRU: 55555554
> [  T843] Call Trace:
> [  T843]  <TASK>
> [  T843]  ? __die+0x1e/0x60
> [  T843]  ? page_fault_oops+0x157/0x450
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? search_bpf_extables+0x5a/0x80
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? exc_page_fault+0x2bb/0x670
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? lock_timer_base+0x71/0x90
> [  T843]  ? asm_exc_page_fault+0x26/0x30
> [  T843]  ? mt792x_mac_link_bss_remove+0x24/0x110 [mt792x_lib]
> [  T843]  ? mt792x_remove_interface+0x6e/0x90 [mt792x_lib]
> [  T843]  ? ieee80211_do_stop+0x507/0x7e0 [mac80211]
> [  T843]  ? ieee80211_stop+0x53/0x190 [mac80211]
> [  T843]  ? __dev_close_many+0xa5/0x120
> [  T843]  ? __dev_change_flags+0x18c/0x220
> [  T843]  ? dev_change_flags+0x21/0x60
> [  T843]  ? do_setlink+0xdf9/0x11d0
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? security_sock_rcv_skb+0x33/0x50
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? __nla_validate_parse+0x61/0xd10
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? genl_done+0x53/0x80
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? netlink_dump+0x357/0x410
> [  T843]  ? __rtnl_newlink+0x5d6/0x980
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? genl_family_rcv_msg_dumpit+0xdf/0xf0
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? __kmalloc_cache_noprof+0x44/0x210
> [  T843]  ? rtnl_newlink+0x42/0x60
> [  T843]  ? rtnetlink_rcv_msg+0x152/0x3f0
> [  T843]  ? mptcp_pm_nl_dump_addr+0x180/0x180
> [  T843]  ? rtnl_calcit.isra.0+0x130/0x130
> [  T843]  ? netlink_rcv_skb+0x56/0x100
> [  T843]  ? netlink_unicast+0x199/0x290
> [  T843]  ? netlink_sendmsg+0x21d/0x490
> [  T843]  ? __sock_sendmsg+0x78/0x80
> [  T843]  ? ____sys_sendmsg+0x23f/0x2e0
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? copy_msghdr_from_user+0x68/0xa0
> [  T843]  ? ___sys_sendmsg+0x81/0xd0
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? crng_fast_key_erasure+0xbc/0xf0
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? get_random_bytes_user+0x126/0x140
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? __fdget+0xb1/0xe0
> [  T843]  ? __sys_sendmsg+0x56/0xa0
> [  T843]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T843]  ? do_syscall_64+0x5f/0x170
> [  T843]  ? entry_SYSCALL_64_after_hwframe+0x55/0x5d
> [  T843]  </TASK>
> [  T843] Modules linked in: bnep nls_ascii nls_cp437 vfat fat snd_ctl_led snd_hda_codec_realtek snd_hda_codec_generic snd_hda_scodec_component snd_hda_codec_hdmi btusb btrtl btintel snd_hda_intel uvcvideo btbcm snd_intel_dspcfg btmtk snd_hda_codec snd_soc_dmic snd_acp3x_pdm_dma snd_acp3x_rn videobuf2_vmalloc snd_hwdep uvc bluetooth videobuf2_memops snd_soc_core snd_hda_core videobuf2_v4l2 snd_pcm_oss snd_mixer_oss videodev snd_pcm snd_rn_pci_acp3x videobuf2_common snd_acp_config snd_timer msi_wmi snd_soc_acpi ecdh_generic amd_atl mc ecc sparse_keymap edac_mce_amd wmi_bmof snd ccp k10temp snd_pci_acp3x soundcore ac battery button hid_sensor_prox hid_sensor_gyro_3d hid_sensor_als hid_sensor_accel_3d hid_sensor_magn_3d hid_sensor_trigger industrialio_triggered_buffer joydev kfifo_buf industrialio amd_pmc hid_sensor_iio_common evdev hid_multitouch serio_raw mt7921e mt7921_common mt792x_lib mt76_connac_lib mt76 mac80211 libarc4 cfg80211 rfkill msr fuse nvme_fabrics configfs efi_pstore efivarfs autofs4 ext4
> [  T843]  crc32c_generic mbcache jbd2 usbhid amdgpu i2c_algo_bit drm_ttm_helper xhci_pci ttm drm_exec drm_suballoc_helper xhci_hcd amdxcp drm_buddy hid_sensor_hub usbcore nvme gpu_sched mfd_core hid_generic crc32c_intel psmouse drm_display_helper amd_sfh i2c_piix4 usb_common nvme_core crc16 r8169 i2c_hid_acpi i2c_hid hid i2c_designware_platform i2c_designware_core
> [  T843] CR2: ffffffffffffffa0
> [  T843] ---[ end trace 0000000000000000 ]---
>
> I bisected the error between linux-6.10-rc7 and linux-next-20240711 and the
> first offending commit which showed a NULL pointer error was
> https://lore.kernel.org/all/20240613030241.5771-47-sean.wang@kernel.org/
> but the error is actually a different but most likely related NULL pointer error.
> To debug this I added some printk to the offending commit:
>
> diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h
> index 69eb8dac0b70..c17195559b82 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt792x.h
> +++ b/drivers/net/wireless/mediatek/mt76/mt792x.h
> @@ -230,6 +230,7 @@ mt792x_vif_to_link(struct mt792x_vif *mvif, u8 link_id)
>         struct ieee80211_vif *vif;
>
>         vif = container_of((void *)mvif, struct ieee80211_vif, drv_priv);
> +       printk(KERN_INFO "%s %d: vif = %px\n", __func__, __LINE__, vif);
>
>         if (!ieee80211_vif_is_mld(vif) ||
>             link_id >= IEEE80211_LINK_UNSPECIFIED)
> @@ -259,6 +260,7 @@ mt792x_link_conf_to_mconf(struct ieee80211_bss_conf *link_conf)
>  {
>         struct ieee80211_vif *vif = link_conf->vif;
>         struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> +       printk(KERN_INFO "%s %d: vif = %px mvif = %px\n", __func__, __LINE__, vif, mvif);
>
>         return mt792x_vif_to_link(mvif, link_conf->link_id);
>  }
> diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
> index 813296fad0ed..ff627f5986bd 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
> @@ -119,23 +119,34 @@ static void mt792x_mac_link_bss_remove(struct mt792x_dev *dev,
>  {
>         struct mt792x_bss_conf *mconf = mt792x_link_conf_to_mconf(link_conf);
>         int idx = mlink->wcid.idx;
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>
>         mt792x_mutex_acquire(dev);
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>         mt76_connac_free_pending_tx_skbs(&dev->pm, &mlink->wcid);
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>         mt76_connac_mcu_uni_add_dev(&dev->mphy, link_conf, &mlink->wcid, false);
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>
>         rcu_assign_pointer(dev->mt76.wcid[idx], NULL);
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>
>         dev->mt76.vif_mask &= ~BIT_ULL(mconf->mt76.idx);
> +       printk(KERN_INFO "%s %d: mconf->vif = %px\n", __func__, __LINE__, mconf->vif);
>         mconf->vif->phy->omac_mask &= ~BIT_ULL(mconf->mt76.omac_idx);
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>         mt792x_mutex_release(dev);
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>
>         spin_lock_bh(&dev->mt76.sta_poll_lock);
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>         if (!list_empty(&mlink->wcid.poll_list))
>                 list_del_init(&mlink->wcid.poll_list);
>         spin_unlock_bh(&dev->mt76.sta_poll_lock);
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>
>         mt76_wcid_cleanup(&dev->mt76, &mlink->wcid);
> +       printk(KERN_INFO "%s %d\n", __func__, __LINE__);
>  }
>
>  void mt792x_remove_interface(struct ieee80211_hw *hw,
> @@ -143,6 +154,8 @@ void mt792x_remove_interface(struct ieee80211_hw *hw,
>  {
>         struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
>         struct mt792x_dev *dev = mt792x_hw_dev(hw);
> +       printk(KERN_INFO "%s %d: hw = %px vif = %px mvif = %px dev = %px\n",
> +                       __func__, __LINE__, hw, vif, mvif, dev);
>
>         mt792x_mac_link_bss_remove(dev, &vif->bss_conf, &mvif->sta.deflink);
>  }
>
> With these I get the following error message on startup:
>
> [  T858] mt792x_remove_interface 157: hw = ffff92dc11560900 vif = ffff92dbe072d970 mvif = ffff92dbe072de00 dev = ffff92dc11562000
> [  T858] mt792x_link_conf_to_mconf 263: vif = ffff92dbe072d970 mvif = ffff92dbe072de00
> [  T858] mt792x_vif_to_link 233: vif = ffff92dbe072d970
> [  T858] mt792x_mac_link_bss_remove 122
> [  T858] mt792x_mac_link_bss_remove 125
> [  T858] mt792x_mac_link_bss_remove 127
> [  T858] mt792x_mac_link_bss_remove 129
> [  T858] mt792x_mac_link_bss_remove 132
> [  T858] mt792x_mac_link_bss_remove 135: mconf->vif = 0000000000000000
> [  T858] BUG: kernel NULL pointer dereference, address: 00000000000004b8
> [  T858] #PF: supervisor read access in kernel mode
> [  T858] #PF: error_code(0x0000) - not-present page
> [  T858] PGD 0 P4D 0
> [  T858] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
> [  T858] CPU: 0 PID: 858 Comm: NetworkManager Not tainted 6.10.0-rc5-debug-01238-g1541d63c5fe2-dirty #30
> [  T858] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.107 11/10/2021
> [  T858] RIP: 0010:mt792x_remove_interface+0x1df/0x2e0 [mt792x_lib]
> [  T858] Code: 85 f0 30 00 00 49 8b 4f 18 e8 5d 4f f0 f7 49 8b 47 18 41 0f b6 4f 01 ba 89 00 00 00 48 c7 c6 90 24 05 c1 48 c7 c7 36 33 05 c1 <48> 8b 80 b8 04 00 00 49 d3 e4 49 f7 d4 4c 21 a0 10 27 00 00 4c 8d
> [  T858] RSP: 0018:ffff9fcf03db7698 EFLAGS: 00010246
> [  T858] RAX: 0000000000000000 RBX: ffff92dbe072d970 RCX: 0000000000000000
> [  T858] RDX: 0000000000000089 RSI: ffffffffc1052490 RDI: ffffffffc1053336
> [  T858] RBP: ffff92dc11562000 R08: 0000000000000000 R09: 0000000000000003
> [  T858] R10: ffff9fcf03db7550 R11: ffffffffba099d28 R12: 0000000000000001
> [  T858] R13: ffff92dbe072ded8 R14: ffff92dc1156a150 R15: ffff92dbe072de00
> [  T858] FS:  00007fa13c515500(0000) GS:ffff92ea6e600000(0000) knlGS:0000000000000000
> [  T858] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  T858] CR2: 00000000000004b8 CR3: 00000001046a2000 CR4: 0000000000750ef0
> [  T858] PKRU: 55555554
> [  T858] Call Trace:
> [  T858]  <TASK>
> [  T858]  ? __die+0x1e/0x60
> [  T858]  ? page_fault_oops+0x157/0x450
> [  T858]  ? _prb_read_valid+0x273/0x2e0
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? exc_page_fault+0x331/0x670
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? prb_read_valid+0x16/0x20
> [  T858]  ? asm_exc_page_fault+0x26/0x30
> [  T858]  ? mt792x_remove_interface+0x1df/0x2e0 [mt792x_lib]
> [  T858]  ? mt792x_remove_interface+0x1c3/0x2e0 [mt792x_lib]
> [  T858]  ? ieee80211_do_stop+0x507/0x7e0 [mac80211]
> [  T858]  ? ieee80211_stop+0x53/0x190 [mac80211]
> [  T858]  ? __dev_close_many+0xa5/0x120
> [  T858]  ? __dev_change_flags+0x18c/0x220
> [  T858]  ? dev_change_flags+0x21/0x60
> [  T858]  ? do_setlink+0xdf9/0x11d0
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? __kmalloc_large_node+0x7e/0xb0
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? security_sock_rcv_skb+0x33/0x50
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? __nla_validate_parse+0x61/0xd10
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? genl_done+0x53/0x80
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? netlink_dump+0x357/0x410
> [  T858]  ? __rtnl_newlink+0x5d1/0x980
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? genl_family_rcv_msg_dumpit+0xdf/0xf0
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? kmalloc_trace_noprof+0x44/0x210
> [  T858]  ? rtnl_newlink+0x42/0x60
> [  T858]  ? rtnetlink_rcv_msg+0x14d/0x3f0
> [  T858]  ? mptcp_pm_nl_dump_addr+0x180/0x180
> [  T858]  ? rtnl_calcit.isra.0+0x130/0x130
> [  T858]  ? netlink_rcv_skb+0x56/0x100
> [  T858]  ? netlink_unicast+0x199/0x290
> [  T858]  ? netlink_sendmsg+0x21d/0x490
> [  T858]  ? __sock_sendmsg+0x78/0x80
> [  T858]  ? ____sys_sendmsg+0x23f/0x2e0
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? copy_msghdr_from_user+0x68/0xa0
> [  T858]  ? ___sys_sendmsg+0x81/0xd0
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? crng_fast_key_erasure+0xbc/0xf0
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? get_random_bytes_user+0x126/0x140
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? __fdget+0xb1/0xe0
> [  T858]  ? __sys_sendmsg+0x56/0xa0
> [  T858]  ? srso_alias_return_thunk+0x5/0xfbef5
> [  T858]  ? do_syscall_64+0x5f/0x170
> [  T858]  ? entry_SYSCALL_64_after_hwframe+0x55/0x5d
> [  T858]  </TASK>
> [  T858] Modules linked in: cmac bnep nls_ascii nls_cp437 vfat fat snd_ctl_led snd_hda_codec_realtek snd_hda_codec_generic snd_hda_scodec_component btusb snd_hda_codec_hdmi btrtl btintel btbcm btmtk snd_hda_intel amd_atl snd_intel_dspcfg bluetooth snd_acp3x_pdm_dma snd_soc_dmic snd_acp3x_rn snd_hda_codec uvcvideo snd_soc_core videobuf2_vmalloc uvc snd_hwdep videobuf2_memops videobuf2_v4l2 snd_hda_core videodev snd_pcm_oss snd_mixer_oss snd_pcm snd_rn_pci_acp3x snd_acp_config videobuf2_common snd_timer msi_wmi snd_soc_acpi ecdh_generic ecc mc sparse_keymap edac_mce_amd snd wmi_bmof k10temp ccp snd_pci_acp3x soundcore battery ac button joydev hid_sensor_magn_3d hid_sensor_gyro_3d hid_sensor_als hid_sensor_accel_3d hid_sensor_prox hid_sensor_trigger industrialio_triggered_buffer kfifo_buf industrialio amd_pmc hid_sensor_iio_common evdev hid_multitouch serio_raw mt7921e mt7921_common mt792x_lib mt76_connac_lib mt76 mac80211 libarc4 cfg80211 rfkill msr fuse nvme_fabrics efi_pstore configfs efivarfs autofs4 ext4
> [  T858]  crc32c_generic crc16 mbcache jbd2 usbhid amdgpu i2c_algo_bit xhci_pci drm_ttm_helper ttm xhci_hcd drm_exec drm_suballoc_helper amdxcp nvme drm_buddy hid_sensor_hub usbcore gpu_sched nvme_core mfd_core hid_generic crc32c_intel psmouse amd_sfh i2c_piix4 usb_common t10_pi drm_display_helper r8169 i2c_hid_acpi i2c_hid hid i2c_designware_platform i2c_designware_core
> [  T858] CR2: 00000000000004b8
> [  T858] ---[ end trace 0000000000000000 ]---
>
> So the problem is here that mconf->vif is still NULL probably because
> on mt7921 nobody is bothering to set it.
>
> I did a similar investigation for the error in linux-next-20240711
>
> void mt792x_mac_link_bss_remove(struct mt792x_dev *dev,
>                                 struct mt792x_bss_conf *mconf,
>                                 struct mt792x_link_sta *mlink)
> {
>         struct ieee80211_vif *vif = container_of((void *)mconf->vif,
>                                                  struct ieee80211_vif, drv_priv);
>         struct ieee80211_bss_conf *link_conf;
>         int idx = mlink->wcid.idx;
>
>         printk(KERN_INFO "%s %d: dev = %px mconf = %px mlink = %px vif = %px\n",
>                         __func__, __LINE__, dev, mconf, mlink, vif);
>         link_conf = mt792x_vif_to_bss_conf(vif, mconf->link_id);
>
> This leads to the following message on startup
>
> [  T848] mt792x_mac_link_bss_remove 147: dev = ffff9403c1672000 mconf = ffff9403c1a35e20 mlink = ffff9403c1a35f00 vif = fffffffffffffb70
> [  T848] BUG: unable to handle page fault for address: ffffffffffffffa0
> [  T848] #PF: supervisor read access in kernel mode
> [  T848] #PF: error_code(0x0000) - not-present page
> [skipped backtrace]
>
> showing that vif is an invalid (though not NULL) pointer here, too.
>
> Bert Karwatzki
Bert Karwatzki July 12, 2024, 8:57 a.m. UTC | #3
Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
> Hi Bert,
>
> Thanks for the detailed debug log. I've quickly made a change to fix
> the issue. Right now, I can't access the test environment, but I'll
> test it and send it out as soon as possible. Here's the patch.
>
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> index 2e6268cb06c0..1bab93d049df 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
> struct ieee80211_vif *vif)
>
>         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
>         mvif->phy = phy;
> +       mvif->bss_conf.vif = mvif;
>         mvif->bss_conf.mt76.band_idx = 0;
>         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> MT76_CONNAC_MAX_WMM_SETS;
>

I tested your patch with linux-next-20240711 where it works fine and with linux-
next-20240712 where the mediatek related error are solved, too, but gdm hangs on
startup (with no related log messages, which I'm sure is a different problem).

Bert Karwatzki
Bert Karwatzki July 12, 2024, 11:06 a.m. UTC | #4
Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
> Hi Bert,
>
> Thanks for the detailed debug log. I've quickly made a change to fix
> the issue. Right now, I can't access the test environment, but I'll
> test it and send it out as soon as possible. Here's the patch.
>
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> index 2e6268cb06c0..1bab93d049df 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
> struct ieee80211_vif *vif)
>
>         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
>         mvif->phy = phy;
> +       mvif->bss_conf.vif = mvif;
>         mvif->bss_conf.mt76.band_idx = 0;
>         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> MT76_CONNAC_MAX_WMM_SETS;
>

I wrote earlier that this patch works fine with linux-next-20240711 and at first
it did, but then another NULL pointer error occured. I'm not sure if I can
bisect this as it does not trigger automatically it seems. Also I'm currently
bisecting the problem with linux-20240712

Bert Karwatzki

[  T853] BUG: kernel NULL pointer dereference, address: 0000000000000008
[  T853] #PF: supervisor read access in kernel mode
[  T853] #PF: error_code(0x0000) - not-present page
[  T853] PGD 0 P4D 0
[  T853] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
[  T853] CPU: 7 UID: 0 PID: 853 Comm: NetworkManager Not tainted 6.10.0-rc7-
next-20240711-debug-dirty #32
[  T853] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-
158L, BIOS E158LAMS.107 11/10/2021
[  T853] RIP: 0010:mt7921_ipv6_addr_change+0x3b/0x1d0 [mt7921_common]
[  T853] Code: 68 02 00 00 41 54 4c 89 ef 53 48 89 d3 48 83 e4 f0 48 83 ec 60 65
48 8b 04 25 28 00 00 00 48 89 44 24 58 48 8b 86 70 09 00 00 <4c> 8b 60 08 0f b6
86 90 04 00 00 48 c7 44 24 05 00 00 00 00 88 44
[  T853] RSP: 0018:ffffb1d203ef35e0 EFLAGS: 00010282
[  T853] RAX: 0000000000000000 RBX: ffff8c73c1ada000 RCX: 0000000000000000
[  T853] RDX: ffff8c73c1ada000 RSI: ffff8c73849d9990 RDI: ffff8c73c1ada268
[  T853] RBP: ffffb1d203ef3668 R08: ffff8c73c19703c0 R09: 0000000000000000
[  T853] R10: ffffb1d203ef36b0 R11: 0000000000000000 R12: 0000000000000002
[  T853] R13: ffff8c73c1ada268 R14: 00000000ffffffff R15: ffff8c73846b08d8
[  T853] FS:  00007f640a9e6500(0000) GS:ffff8c822e5c0000(0000)
knlGS:0000000000000000
[  T853] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  T853] CR2: 0000000000000008 CR3: 00000001043e0000 CR4: 0000000000750ef0
[  T853] PKRU: 55555554
[  T853] Call Trace:
[  T853]  <TASK>
[  T853]  ? __die+0x1e/0x60
[  T853]  ? page_fault_oops+0x157/0x450
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? pollwake+0x73/0xa0
[  T853]  ? wake_up_state+0x10/0x10
[  T853]  ? exc_page_fault+0x331/0x670
[  T853]  ? asm_exc_page_fault+0x26/0x30
[  T853]  ? mt7921_ipv6_addr_change+0x3b/0x1d0 [mt7921_common]
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? __ipv6_ifa_notify+0x16f/0x4d0
[  T853]  ? ieee80211_ifa6_changed+0x5e/0x70 [mac80211]
[  T853]  ? atomic_notifier_call_chain+0x51/0x80
[  T853]  ? addrconf_ifdown.isra.0+0x43f/0x810
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? addrconf_notify+0x15d/0x760
[  T853]  ? __timer_delete_sync+0x70/0xd0
[  T853]  ? raw_notifier_call_chain+0x43/0x60
[  T853]  ? __dev_notify_flags+0x91/0xf0
[  T853]  ? dev_change_flags+0x4f/0x60
[  T853]  ? do_setlink+0xdf9/0x11d0
[  T853]  ? __skb_flow_dissect+0xc30/0x1a40
[  T853]  ? drain_stock+0x63/0xa0
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? __nla_validate_parse+0x61/0xd10
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? update_load_avg+0x77/0x6d0
[  T853]  ? __rtnl_newlink+0x5d6/0x980
[  T853]  ? __entry_text_end+0x102398/0x10239b
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? kmem_cache_alloc_node_noprof+0x4c/0x220
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? __kmalloc_cache_noprof+0x44/0x210
[  T853]  ? rtnl_newlink+0x42/0x60
[  T853]  ? rtnetlink_rcv_msg+0x152/0x3f0
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? netlink_unicast+0x238/0x290
[  T853]  ? rtnl_calcit.isra.0+0x130/0x130
[  T853]  ? netlink_rcv_skb+0x56/0x100
[  T853]  ? netlink_unicast+0x199/0x290
[  T853]  ? netlink_sendmsg+0x21d/0x490
[  T853]  ? __sock_sendmsg+0x78/0x80
[  T853]  ? ____sys_sendmsg+0x23f/0x2e0
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? copy_msghdr_from_user+0x68/0xa0
[  T853]  ? ___sys_sendmsg+0x81/0xd0
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? crng_fast_key_erasure+0xbc/0xf0
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? get_random_bytes_user+0x126/0x140
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? __fdget+0xb1/0xe0
[  T853]  ? __sys_sendmsg+0x56/0xa0
[  T853]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T853]  ? do_syscall_64+0x5f/0x170
[  T853]  ? entry_SYSCALL_64_after_hwframe+0x55/0x5d
[  T853]  </TASK>
[  T853] Modules linked in: ccm snd_seq_dummy snd_hrtimer snd_seq_midi
snd_seq_midi_event snd_rawmidi snd_seq snd_seq_device rfcomm cmac bnep nls_ascii
nls_cp437 vfat fat snd_ctl_led snd_hda_codec_realtek snd_hda_codec_generic
snd_hda_scodec_component snd_hda_codec_hdmi btusb btrtl btintel snd_hda_intel
btbcm btmtk snd_intel_dspcfg snd_hda_codec snd_soc_dmic snd_acp3x_pdm_dma
snd_acp3x_rn bluetooth snd_soc_core uvcvideo snd_hwdep snd_hda_core
videobuf2_vmalloc uvc videobuf2_memops videobuf2_v4l2 snd_pcm_oss videodev
snd_mixer_oss snd_pcm snd_rn_pci_acp3x hid_sensor_gyro_3d hid_sensor_magn_3d
hid_sensor_accel_3d hid_sensor_als hid_sensor_prox snd_acp_config
videobuf2_common snd_timer hid_sensor_trigger msi_wmi snd_soc_acpi ecdh_generic
amd_atl ecc mc snd edac_mce_amd wmi_bmof sparse_keymap ccp soundcore k10temp
snd_pci_acp3x button industrialio_triggered_buffer battery joydev ac kfifo_buf
industrialio hid_sensor_iio_common amd_pmc evdev hid_multitouch serio_raw
mt7921e mt7921_common mt792x_lib mt76_connac_lib mt76
[  T853]  mac80211 libarc4 cfg80211 rfkill msr fuse nvme_fabrics efi_pstore
configfs efivarfs autofs4 ext4 crc32c_generic mbcache jbd2 usbhid amdgpu
i2c_algo_bit drm_ttm_helper ttm drm_exec xhci_pci drm_suballoc_helper amdxcp
drm_buddy xhci_hcd gpu_sched hid_sensor_hub drm_display_helper usbcore nvme
mfd_core hid_generic crc32c_intel psmouse amd_sfh i2c_piix4 usb_common crc16
nvme_core r8169 i2c_hid_acpi i2c_hid hid i2c_designware_platform
i2c_designware_core
[  T853] CR2: 0000000000000008
[  T853] ---[ end trace 0000000000000000 ]---
Bert Karwatzki July 17, 2024, 2:38 p.m. UTC | #5
Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki:
> Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
> > Hi Bert,
> >
> > Thanks for the detailed debug log. I've quickly made a change to fix
> > the issue. Right now, I can't access the test environment, but I'll
> > test it and send it out as soon as possible. Here's the patch.
> >
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > index 2e6268cb06c0..1bab93d049df 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
> > struct ieee80211_vif *vif)
> >
> >         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> >         mvif->phy = phy;
> > +       mvif->bss_conf.vif = mvif;
> >         mvif->bss_conf.mt76.band_idx = 0;
> >         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > MT76_CONNAC_MAX_WMM_SETS;
> >
>
> I wrote earlier that this patch works fine with linux-next-20240711 and at first
> it did, but then another NULL pointer error occured. I'm not sure if I can
> bisect this as it does not trigger automatically it seems. Also I'm currently
> bisecting the problem with linux-20240712
>
> Bert Karwatzki

Now the above mentioned NULL pointer dereference has happended again, this time
on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off
and on again. For further investigation I created this patch:

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 2e6268cb06c0..3ecedf7bc9f3 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct
ieee80211_vif *vif)

 	mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
 	mvif->phy = phy;
+	WARN(!phy, "%s: phy = NULL\n", __func__);
+	mvif->bss_conf.vif = mvif;
 	mvif->bss_conf.mt76.band_idx = 0;
 	mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
MT76_CONNAC_MAX_WMM_SETS;

@@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
*hw,
 				    struct inet6_dev *idev)
 {
 	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
+	printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif);
+	printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy);
+	if (!mvif->phy) {
+		printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__);
+		return;
+	}
 	struct mt792x_dev *dev = mvif->phy->dev;
 	struct inet6_ifaddr *ifa;
 	struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];

And the result is this (the WARN in mt7921_add_interface did not trigger):

[  367.121740] [    T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by
local choice (Reason: 3=DEAUTH_LEAVING)
[  367.209603] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
[  367.209615] [    T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000
[  367.209621] [    T861] mt7921_ipv6_addr_change: mvif->phy = NULL
[  367.250026] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
[  367.250034] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
[  367.251537] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
[  367.251542] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
[  369.977123] [    T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local
address=c8:94:02:c1:bd:69)
[  369.984864] [    T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3)
[  370.006199] [    T104] wlp4s0: authenticated
[  370.006680] [    T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3)
[  370.059080] [     T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2
(capab=0x511 status=0 aid=2)
[  370.064067] [     T98] wlp4s0: associated

So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
return in that case avoids the NULL pointer and mvif->phy has its usual value
again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
don't know how this could happen but perhaps you have an idea.

Bert Karwatzki
Felix Fietkau July 17, 2024, 3:25 p.m. UTC | #6
On 17.07.24 16:38, Bert Karwatzki wrote:
> Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki:
>> Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
>> > Hi Bert,
>> >
>> > Thanks for the detailed debug log. I've quickly made a change to fix
>> > the issue. Right now, I can't access the test environment, but I'll
>> > test it and send it out as soon as possible. Here's the patch.
>> >
>> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> > index 2e6268cb06c0..1bab93d049df 100644
>> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
>> > struct ieee80211_vif *vif)
>> >
>> >         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
>> >         mvif->phy = phy;
>> > +       mvif->bss_conf.vif = mvif;
>> >         mvif->bss_conf.mt76.band_idx = 0;
>> >         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
>> > MT76_CONNAC_MAX_WMM_SETS;
>> >
>>
>> I wrote earlier that this patch works fine with linux-next-20240711 and at first
>> it did, but then another NULL pointer error occured. I'm not sure if I can
>> bisect this as it does not trigger automatically it seems. Also I'm currently
>> bisecting the problem with linux-20240712
>>
>> Bert Karwatzki
> 
> Now the above mentioned NULL pointer dereference has happended again, this time
> on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off
> and on again. For further investigation I created this patch:
> 
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> index 2e6268cb06c0..3ecedf7bc9f3 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct
> ieee80211_vif *vif)
> 
>   	mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
>   	mvif->phy = phy;
> +	WARN(!phy, "%s: phy = NULL\n", __func__);
> +	mvif->bss_conf.vif = mvif;
>   	mvif->bss_conf.mt76.band_idx = 0;
>   	mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> MT76_CONNAC_MAX_WMM_SETS;
> 
> @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
> *hw,
>   				    struct inet6_dev *idev)
>   {
>   	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> +	printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif);
> +	printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy);
> +	if (!mvif->phy) {
> +		printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__);
> +		return;
> +	}
>   	struct mt792x_dev *dev = mvif->phy->dev;
>   	struct inet6_ifaddr *ifa;
>   	struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
> 
> And the result is this (the WARN in mt7921_add_interface did not trigger):
> 
> [  367.121740] [    T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by
> local choice (Reason: 3=DEAUTH_LEAVING)
> [  367.209603] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> [  367.209615] [    T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000
> [  367.209621] [    T861] mt7921_ipv6_addr_change: mvif->phy = NULL
> [  367.250026] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> [  367.250034] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> [  367.251537] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> [  367.251542] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> [  369.977123] [    T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local
> address=c8:94:02:c1:bd:69)
> [  369.984864] [    T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3)
> [  370.006199] [    T104] wlp4s0: authenticated
> [  370.006680] [    T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3)
> [  370.059080] [     T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2
> (capab=0x511 status=0 aid=2)
> [  370.064067] [     T98] wlp4s0: associated
> 
> So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> return in that case avoids the NULL pointer and mvif->phy has its usual value
> again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> don't know how this could happen but perhaps you have an idea.

This change should fix it: https://nbd.name/p/0747f54f
Please test.

Thanks,

- Felix
Bert Karwatzki July 17, 2024, 5:05 p.m. UTC | #7
Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
> On 17.07.24 16:38, Bert Karwatzki wrote:
> > Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki:
> > > Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
> > > > Hi Bert,
> > > >
> > > > Thanks for the detailed debug log. I've quickly made a change to fix
> > > > the issue. Right now, I can't access the test environment, but I'll
> > > > test it and send it out as soon as possible. Here's the patch.
> > > >
> > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > index 2e6268cb06c0..1bab93d049df 100644
> > > > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
> > > > struct ieee80211_vif *vif)
> > > >
> > > >         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> > > >         mvif->phy = phy;
> > > > +       mvif->bss_conf.vif = mvif;
> > > >         mvif->bss_conf.mt76.band_idx = 0;
> > > >         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > > > MT76_CONNAC_MAX_WMM_SETS;
> > > >
> > >
> > > I wrote earlier that this patch works fine with linux-next-20240711 and at first
> > > it did, but then another NULL pointer error occured. I'm not sure if I can
> > > bisect this as it does not trigger automatically it seems. Also I'm currently
> > > bisecting the problem with linux-20240712
> > >
> > > Bert Karwatzki
> >
> > Now the above mentioned NULL pointer dereference has happended again, this time
> > on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off
> > and on again. For further investigation I created this patch:
> >
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > index 2e6268cb06c0..3ecedf7bc9f3 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct
> > ieee80211_vif *vif)
> >
> >   	mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> >   	mvif->phy = phy;
> > +	WARN(!phy, "%s: phy = NULL\n", __func__);
> > +	mvif->bss_conf.vif = mvif;
> >   	mvif->bss_conf.mt76.band_idx = 0;
> >   	mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > MT76_CONNAC_MAX_WMM_SETS;
> >
> > @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
> > *hw,
> >   				    struct inet6_dev *idev)
> >   {
> >   	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> > +	printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif);
> > +	printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy);
> > +	if (!mvif->phy) {
> > +		printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__);
> > +		return;
> > +	}
> >   	struct mt792x_dev *dev = mvif->phy->dev;
> >   	struct inet6_ifaddr *ifa;
> >   	struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
> >
> > And the result is this (the WARN in mt7921_add_interface did not trigger):
> >
> > [  367.121740] [    T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by
> > local choice (Reason: 3=DEAUTH_LEAVING)
> > [  367.209603] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.209615] [    T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000
> > [  367.209621] [    T861] mt7921_ipv6_addr_change: mvif->phy = NULL
> > [  367.250026] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.250034] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> > [  367.251537] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.251542] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> > [  369.977123] [    T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local
> > address=c8:94:02:c1:bd:69)
> > [  369.984864] [    T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3)
> > [  370.006199] [    T104] wlp4s0: authenticated
> > [  370.006680] [    T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3)
> > [  370.059080] [     T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2
> > (capab=0x511 status=0 aid=2)
> > [  370.064067] [     T98] wlp4s0: associated
> >
> > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> > return in that case avoids the NULL pointer and mvif->phy has its usual value
> > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> > don't know how this could happen but perhaps you have an idea.
>
> This change should fix it: https://nbd.name/p/0747f54f
> Please test.
>
> Thanks,
>
> - Felix
>

Your fix works. (I added a WARN() statement on the early return, to see if mvif-
>phy actually was NULL during testing).

Bert Karwatzki
Bert Karwatzki July 18, 2024, 1:10 a.m. UTC | #8
Am Mittwoch, dem 17.07.2024 um 19:05 +0200 schrieb Bert Karwatzki:
> Am
>
> Your fix works. (I added a WARN() statement on the early return, to see if mvif-
> > phy actually was NULL during testing).
>
> Bert Karwatzki
>

While your fix works there was still the question why the driver is suddenly
forgetting mvif->phy? So I've been testing with this script:

#!/bin/sh
for i in $(seq 1 100);
do
		nmcli radio wifi off
		nmcli radio wifi on
	# wait for wifi
	sleep 3
done

together with this patch:

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 4f30426afbb7..206f10473d92 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -1182,6 +1182,10 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
*hw,
                                    struct inet6_dev *idev)
 {
        struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
+       if (!mvif->phy) {
+               WARN(1, "mvif->phy == NULL\n");
+               return;
+       }
        struct mt792x_dev *dev = mvif->phy->dev;
        struct inet6_ifaddr *ifa;
        struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];


On linux-6.10 the script can be run several times without triggering a warning
while on linux-next-20240716 running the script will trigger several warnings.
In the end the result of a bisection is this as the first commit to trigger the
warning:
commit 574e609c4e6a0843a9ed53de79e00da8fb3e7437
Author: Felix Fietkau <nbd@nbd.name>
Date:   Thu Jul 4 15:09:47 2024 +0200

    wifi: mac80211: clear vif drv_priv after remove_interface when stopping

    Avoid reusing stale driver data when an interface is brought down and up
    again. In order to avoid having to duplicate the memset in every single
    driver, do it here.

    Signed-off-by: Felix Fietkau <nbd@nbd.name>
    Link: https://patch.msgid.link/20240704130947.48609-1-nbd@nbd.name
    Signed-off-by: Johannes Berg <johannes.berg@intel.com>

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 6d969d9f1ac9..97aee0a1a39a 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -689,8 +689,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data
*sdata, bool going_do

 		fallthrough;
 	default:
-		if (going_down)
-			drv_remove_interface(local, sdata);
+		if (!going_down)
+			break;
+		drv_remove_interface(local, sdata);
+
+		/* Clear private driver data to prevent reuse */
+		memset(sdata->vif.drv_priv, 0, local->hw.vif_data_size);
 	}

 	ieee80211_recalc_ps(local);

As this is in generic mac80211 code this could probably also affect other
drivers and their ipv6_addr_change function. (which in turn could easily be
fixed by an early exit when mvif->phy == NULL)

Bert Karwatzki
Bert Karwatzki July 18, 2024, 10:43 a.m. UTC | #9
I looked more close at the call trace of the warning and it seems that the
problems 
occur when shutting down the interface:
[  T847] Call Trace:
[  T847]  <TASK>
[  T847]  ? __warn+0x6a/0xc0
[  T847]  ? mt7921_ipv6_addr_change+0x1d0/0x1f0 [mt7921_common]
[  T847]  ? report_bug+0x142/0x180
[  T847]  ? handle_bug+0x3a/0x70
[  T847]  ? exc_invalid_op+0x17/0x70
[  T847]  ? asm_exc_invalid_op+0x1a/0x20
[  T847]  ? mt7921_ipv6_addr_change+0x1d0/0x1f0 [mt7921_common]
[  T847]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T847]  ? __ipv6_ifa_notify+0x16f/0x4d0
[  T847]  ? ieee80211_ifa6_changed+0x5e/0x70 [mac80211]
[  T847]  ? atomic_notifier_call_chain+0x51/0x80
[  T847]  ? addrconf_ifdown.isra.0+0x43f/0x810
[  T847]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T847]  ? addrconf_notify+0x15d/0x760
[  T847]  ? __timer_delete_sync+0x70/0xd0
[  T847]  ? raw_notifier_call_chain+0x43/0x60
[  T847]  ? dev_close_many+0xea/0x160
[  T847]  ? dev_close+0x65/0x80
[  T847]  ? cfg80211_shutdown_all_interfaces+0x48/0xe0 [cfg80211]
[  T847]  ? cfg80211_rfkill_set_block+0x25/0x40 [cfg80211]
[  T847]  ? rfkill_set_block+0x8f/0x160 [rfkill]
[  T847]  ? rfkill_fop_write+0x14e/0x1e0 [rfkill]
[  T847]  ? vfs_write+0xf3/0x420
[  T847]  ? srso_alias_return_thunk+0x5/0xfbef5
[  T847]  ? ksys_write+0xae/0xe0
[  T847]  ? do_syscall_64+0x5f/0x170
[  T847]  ? entry_SYSCALL_64_after_hwframe+0x55/0x5d
[  T847]  </TASK>
[  T847] ---[ end trace 0000000000000000 ]---

I think there's a race happening on shutdown between ipv6_addr_change (which
uses mvif->phy)
and ieee80211_do_stop (which zeros the private data including mvif->phy).

Resend with fixed formatting.

Bert Karwatzki
Bert Karwatzki July 29, 2024, 11:12 a.m. UTC | #10
Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
> On 17.07.24 16:38, Bert Karwatzki wrote:
> > Am Freitag, dem 12.07.2024 um 13:06 +0200 schrieb Bert Karwatzki:
> > > Am Donnerstag, dem 11.07.2024 um 18:40 -0500 schrieb Sean Wang:
> > > > Hi Bert,
> > > >
> > > > Thanks for the detailed debug log. I've quickly made a change to fix
> > > > the issue. Right now, I can't access the test environment, but I'll
> > > > test it and send it out as soon as possible. Here's the patch.
> > > >
> > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > index 2e6268cb06c0..1bab93d049df 100644
> > > > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > > > @@ -303,6 +303,7 @@ mt7921_add_interface(struct ieee80211_hw *hw,
> > > > struct ieee80211_vif *vif)
> > > >
> > > >         mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> > > >         mvif->phy = phy;
> > > > +       mvif->bss_conf.vif = mvif;
> > > >         mvif->bss_conf.mt76.band_idx = 0;
> > > >         mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > > > MT76_CONNAC_MAX_WMM_SETS;
> > > >
> > >
> > > I wrote earlier that this patch works fine with linux-next-20240711 and at first
> > > it did, but then another NULL pointer error occured. I'm not sure if I can
> > > bisect this as it does not trigger automatically it seems. Also I'm currently
> > > bisecting the problem with linux-20240712
> > >
> > > Bert Karwatzki
> >
> > Now the above mentioned NULL pointer dereference has happended again, this time
> > on linux-next-20240716. It cann be triggered by repeatedly turning the Wifi off
> > and on again. For further investigation I created this patch:
> >
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > index 2e6268cb06c0..3ecedf7bc9f3 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> > @@ -303,6 +303,8 @@ mt7921_add_interface(struct ieee80211_hw *hw, struct
> > ieee80211_vif *vif)
> >
> >   	mvif->bss_conf.mt76.omac_idx = mvif->bss_conf.mt76.idx;
> >   	mvif->phy = phy;
> > +	WARN(!phy, "%s: phy = NULL\n", __func__);
> > +	mvif->bss_conf.vif = mvif;
> >   	mvif->bss_conf.mt76.band_idx = 0;
> >   	mvif->bss_conf.mt76.wmm_idx = mvif->bss_conf.mt76.idx %
> > MT76_CONNAC_MAX_WMM_SETS;
> >
> > @@ -1182,6 +1184,12 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
> > *hw,
> >   				    struct inet6_dev *idev)
> >   {
> >   	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> > +	printk(KERN_INFO "%s: mvif = %px\n", __func__, mvif);
> > +	printk(KERN_INFO "%s: mvif->phy = %px\n", __func__, mvif->phy);
> > +	if (!mvif->phy) {
> > +		printk(KERN_INFO "%s: mvif->phy = NULL\n", __func__);
> > +		return;
> > +	}
> >   	struct mt792x_dev *dev = mvif->phy->dev;
> >   	struct inet6_ifaddr *ifa;
> >   	struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
> >
> > And the result is this (the WARN in mt7921_add_interface did not trigger):
> >
> > [  367.121740] [    T861] wlp4s0: deauthenticating from 54:67:51:3d:a2:d2 by
> > local choice (Reason: 3=DEAUTH_LEAVING)
> > [  367.209603] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.209615] [    T861] mt7921_ipv6_addr_change: mvif->phy = 0000000000000000
> > [  367.209621] [    T861] mt7921_ipv6_addr_change: mvif->phy = NULL
> > [  367.250026] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.250034] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> > [  367.251537] [    T861] mt7921_ipv6_addr_change: mvif = ffff954e7500de40
> > [  367.251542] [    T861] mt7921_ipv6_addr_change: mvif->phy = ffff954e44427768
> > [  369.977123] [    T862] wlp4s0: authenticate with 54:67:51:3d:a2:d2 (local
> > address=c8:94:02:c1:bd:69)
> > [  369.984864] [    T862] wlp4s0: send auth to 54:67:51:3d:a2:d2 (try 1/3)
> > [  370.006199] [    T104] wlp4s0: authenticated
> > [  370.006680] [    T104] wlp4s0: associate with 54:67:51:3d:a2:d2 (try 1/3)
> > [  370.059080] [     T98] wlp4s0: RX AssocResp from 54:67:51:3d:a2:d2
> > (capab=0x511 status=0 aid=2)
> > [  370.064067] [     T98] wlp4s0: associated
> >
> > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> > return in that case avoids the NULL pointer and mvif->phy has its usual value
> > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> > don't know how this could happen but perhaps you have an idea.
>
> This change should fix it: https://nbd.name/p/0747f54f
> Please test.
>
> Thanks,
>
> - Felix
>

The BUG is still present in linux-6.11-rc1.

Bert Karwatzki
Kalle Valo July 31, 2024, 8:51 a.m. UTC | #11
Bert Karwatzki <spasswolf@web.de> writes:

> Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
>
>> On 17.07.24 16:38, Bert Karwatzki wrote:
>>
>> > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
>> > return in that case avoids the NULL pointer and mvif->phy has its usual value
>> > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
>> > don't know how this could happen but perhaps you have an idea.
>>
>> This change should fix it: https://nbd.name/p/0747f54f
>> Please test.
>
> The BUG is still present in linux-6.11-rc1.

I'm not sure what's the status with this. There's one mt76 patch going
to v6.11-rc2:

https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3

But that looks to be a fix for a different problem, right? Felix, are
you planning to submit that 0747f54f as a proper patch? I could then
take it to wireless tree.
Bert Karwatzki Aug. 6, 2024, 11:22 a.m. UTC | #12
Am Mittwoch, dem 31.07.2024 um 11:51 +0300 schrieb Kalle Valo:
> Bert Karwatzki <spasswolf@web.de> writes:
>
> > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
> >
> > > On 17.07.24 16:38, Bert Karwatzki wrote:
> > >
> > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> > > > return in that case avoids the NULL pointer and mvif->phy has its usual value
> > > > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> > > > don't know how this could happen but perhaps you have an idea.
> > >
> > > This change should fix it: https://nbd.name/p/0747f54f
> > > Please test.
> >
> > The BUG is still present in linux-6.11-rc1.
>
> I'm not sure what's the status with this. There's one mt76 patch going
> to v6.11-rc2:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3
>
> But that looks to be a fix for a different problem, right? Felix, are
> you planning to submit that 0747f54f as a proper patch? I could then
> take it to wireless tree.
>
The Bug is still present in linux-6.11-rc2 and linux-next-20240806. Also the
mvif->phy NULL check in the original patch is not neccessary (and feels a little
out of place as mvif->phy is not needed anymore). This patch is sufficient to
fix the NULL pointer dereference:
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 1bab93d049df..23b228804289 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
*hw,
                                    struct inet6_dev *idev)
 {
        struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
-       struct mt792x_dev *dev = mvif->phy->dev;
+       struct mt792x_dev *dev = mt792x_hw_dev(hw);
        struct inet6_ifaddr *ifa;
        struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
        struct sk_buff *skb;

Bert Karwatzki
Bert Karwatzki Aug. 12, 2024, 8:57 a.m. UTC | #13
Am Dienstag, dem 06.08.2024 um 13:22 +0200 schrieb Bert Karwatzki:
> Am Mittwoch, dem 31.07.2024 um 11:51 +0300 schrieb Kalle Valo:
> > Bert Karwatzki <spasswolf@web.de> writes:
> >
> > > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
> > >
> > > > On 17.07.24 16:38, Bert Karwatzki wrote:
> > > >
> > > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
> > > > > return in that case avoids the NULL pointer and mvif->phy has its usual value
> > > > > again on the next call to mt7921_ipv6_addr_change so Wifi is working again. I
> > > > > don't know how this could happen but perhaps you have an idea.
> > > >
> > > > This change should fix it: https://nbd.name/p/0747f54f
> > > > Please test.
> > >
> > > The BUG is still present in linux-6.11-rc1.
> >
> > I'm not sure what's the status with this. There's one mt76 patch going
> > to v6.11-rc2:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3
> >
> > But that looks to be a fix for a different problem, right? Felix, are
> > you planning to submit that 0747f54f as a proper patch? I could then
> > take it to wireless tree.
> >
> The Bug is still present in linux-6.11-rc2 and linux-next-20240806. Also the
> mvif->phy NULL check in the original patch is not neccessary (and feels a little
> out of place as mvif->phy is not needed anymore). This patch is sufficient to
> fix the NULL pointer dereference:
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> index 1bab93d049df..23b228804289 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> @@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
> *hw,
>                                     struct inet6_dev *idev)
>  {
>         struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> -       struct mt792x_dev *dev = mvif->phy->dev;
> +       struct mt792x_dev *dev = mt792x_hw_dev(hw);
>         struct inet6_ifaddr *ifa;
>         struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
>         struct sk_buff *skb;
>
> Bert Karwatzki

This error is still present in v6.11-rc3.

Bert Karwatzki
Kalle Valo Aug. 12, 2024, 10:29 a.m. UTC | #14
Bert Karwatzki <spasswolf@web.de> writes:

> Am Dienstag, dem 06.08.2024 um 13:22 +0200 schrieb Bert Karwatzki:
>> Am Mittwoch, dem 31.07.2024 um 11:51 +0300 schrieb Kalle Valo:
>> > Bert Karwatzki <spasswolf@web.de> writes:
>> >
>> > > Am Mittwoch, dem 17.07.2024 um 17:25 +0200 schrieb Felix Fietkau:
>> > >
>> > > > On 17.07.24 16:38, Bert Karwatzki wrote:
>> > > >
>> > > > > So mvif->phy can be NULL at the start of mt7921_ipv6_addr_change. The early
>> > > > > return in that case avoids the NULL pointer and mvif->phy
>> > > > > has its usual value
>> > > > > again on the next call to mt7921_ipv6_addr_change so Wifi is
>> > > > > working again. I
>> > > > > don't know how this could happen but perhaps you have an idea.
>> > > >
>> > > > This change should fix it: https://nbd.name/p/0747f54f
>> > > > Please test.
>> > >
>> > > The BUG is still present in linux-6.11-rc1.
>> >
>> > I'm not sure what's the status with this. There's one mt76 patch going
>> > to v6.11-rc2:
>> >
>> > https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless.git/commit/?id=6557a28f3e3a54cff4f0dcdd1dfa649b26557ab3
>> >
>> > But that looks to be a fix for a different problem, right? Felix, are
>> > you planning to submit that 0747f54f as a proper patch? I could then
>> > take it to wireless tree.
>> >
>> The Bug is still present in linux-6.11-rc2 and linux-next-20240806. Also the
>> mvif->phy NULL check in the original patch is not neccessary (and feels a little
>> out of place as mvif->phy is not needed anymore). This patch is sufficient to
>> fix the NULL pointer dereference:
>> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> index 1bab93d049df..23b228804289 100644
>> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
>> @@ -1183,7 +1183,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw
>> *hw,
>>                                     struct inet6_dev *idev)
>>  {
>>         struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
>> -       struct mt792x_dev *dev = mvif->phy->dev;
>> +       struct mt792x_dev *dev = mt792x_hw_dev(hw);
>>         struct inet6_ifaddr *ifa;
>>         struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
>>         struct sk_buff *skb;
>>
>> Bert Karwatzki
>
> This error is still present in v6.11-rc3.

Bert, can you send your fix as a proper patch? More information in the
wiki below and please mark it for wireless tree.
diff mbox series

Patch

diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h
index 69eb8dac0b70..c17195559b82 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x.h
+++ b/drivers/net/wireless/mediatek/mt76/mt792x.h
@@ -230,6 +230,7 @@  mt792x_vif_to_link(struct mt792x_vif *mvif, u8 link_id)
 	struct ieee80211_vif *vif;
 
 	vif = container_of((void *)mvif, struct ieee80211_vif, drv_priv);
+	printk(KERN_INFO "%s %d: vif = %px\n", __func__, __LINE__, vif);
 
 	if (!ieee80211_vif_is_mld(vif) ||
 	    link_id >= IEEE80211_LINK_UNSPECIFIED)
@@ -259,6 +260,7 @@  mt792x_link_conf_to_mconf(struct ieee80211_bss_conf *link_conf)
 {
 	struct ieee80211_vif *vif = link_conf->vif;
 	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
+	printk(KERN_INFO "%s %d: vif = %px mvif = %px\n", __func__, __LINE__, vif, mvif);
 
 	return mt792x_vif_to_link(mvif, link_conf->link_id);
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
index 813296fad0ed..ff627f5986bd 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c
+++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
@@ -119,23 +119,34 @@  static void mt792x_mac_link_bss_remove(struct mt792x_dev *dev,
 {
 	struct mt792x_bss_conf *mconf = mt792x_link_conf_to_mconf(link_conf);
 	int idx = mlink->wcid.idx;
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	mt792x_mutex_acquire(dev);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 	mt76_connac_free_pending_tx_skbs(&dev->pm, &mlink->wcid);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 	mt76_connac_mcu_uni_add_dev(&dev->mphy, link_conf, &mlink->wcid, false);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	rcu_assign_pointer(dev->mt76.wcid[idx], NULL);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	dev->mt76.vif_mask &= ~BIT_ULL(mconf->mt76.idx);
+	printk(KERN_INFO "%s %d: mconf->vif = %px\n", __func__, __LINE__, mconf->vif);
 	mconf->vif->phy->omac_mask &= ~BIT_ULL(mconf->mt76.omac_idx);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 	mt792x_mutex_release(dev);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	spin_lock_bh(&dev->mt76.sta_poll_lock);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 	if (!list_empty(&mlink->wcid.poll_list))
 		list_del_init(&mlink->wcid.poll_list);
 	spin_unlock_bh(&dev->mt76.sta_poll_lock);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 
 	mt76_wcid_cleanup(&dev->mt76, &mlink->wcid);
+	printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 }
 
 void mt792x_remove_interface(struct ieee80211_hw *hw,
@@ -143,6 +154,8 @@  void mt792x_remove_interface(struct ieee80211_hw *hw,
 {
 	struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
 	struct mt792x_dev *dev = mt792x_hw_dev(hw);
+	printk(KERN_INFO "%s %d: hw = %px vif = %px mvif = %px dev = %px\n",
+			__func__, __LINE__, hw, vif, mvif, dev);
 
 	mt792x_mac_link_bss_remove(dev, &vif->bss_conf, &mvif->sta.deflink);
 }