Message ID | 20200108031957.22308-2-wgong@codeaurora.org (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Kalle Valo |
Headers | show |
Series | start recovery process when payload length overflow for sdio | expand |
I think this might break the "wedged" state. Would simply not taking action unless STATE ON avoid the problems with multiple calls to _restart? ie: diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 5ec16ce19b69..a6c11b2bc97c 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -2198,11 +2198,8 @@ static int ath10k_init_hw_params(struct ath10k *ar) return 0; } -static void ath10k_core_restart(struct work_struct *work) +static void inline _ath10k_core_restart(struct ath10k *ar) { - struct ath10k *ar = container_of(work, struct ath10k, restart_work); - int ret; - set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags); /* Place a barrier to make sure the compiler doesn't reorder @@ -2232,14 +2229,28 @@ static void ath10k_core_restart(struct work_struct *work) */ cancel_work_sync(&ar->set_coverage_class_work); + ath10k_halt(ar); + ath10k_scan_finish(ar); + ieee80211_restart_hw(ar->hw); + + ret = ath10k_coredump_submit(ar); + if (ret) + ath10k_warn(ar, "failed to send firmware crash dump via devcoredump: %d", ret); + + complete(&ar->driver_recovery); +} + +static void ath10k_core_restart(struct work_struct *work) +{ + struct ath10k *ar = container_of(work, struct ath10k, restart_work); + int ret; + mutex_lock(&ar->conf_mutex); switch (ar->state) { case ATH10K_STATE_ON: ar->state = ATH10K_STATE_RESTARTING; - ath10k_halt(ar); - ath10k_scan_finish(ar); - ieee80211_restart_hw(ar->hw); + _ath10k_core_restart(ar); break; case ATH10K_STATE_OFF: /* this can happen if driver is being unloaded @@ -2262,13 +2273,6 @@ static void ath10k_core_restart(struct work_struct *work) } mutex_unlock(&ar->conf_mutex); - - ret = ath10k_coredump_submit(ar); - if (ret) - ath10k_warn(ar, "failed to send firmware crash dump via devcoredump: %d", - ret); - - complete(&ar->driver_recovery); } static void ath10k_core_set_coverage_class_work(struct work_struct *work) On Tue, Jan 7, 2020 at 7:20 PM Wen Gong <wgong@codeaurora.org> wrote: > > When it has more than one restart_work queued meanwhile, the 2nd > restart_work is very esay to break the 1st restart work and lead > recovery fail. > > Add a ref count to allow only one restart work running untill > device successfully recovered. > > Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00029. > > Signed-off-by: Wen Gong <wgong@codeaurora.org> > --- > drivers/net/wireless/ath/ath10k/core.c | 13 +++++++++++++ > drivers/net/wireless/ath/ath10k/core.h | 2 ++ > drivers/net/wireless/ath/ath10k/mac.c | 1 + > 3 files changed, 16 insertions(+) > > diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c > index 91f131b87efc..0e31846e6c89 100644 > --- a/drivers/net/wireless/ath/ath10k/core.c > +++ b/drivers/net/wireless/ath/ath10k/core.c > @@ -2199,6 +2199,14 @@ static void ath10k_core_restart(struct work_struct *work) > { > struct ath10k *ar = container_of(work, struct ath10k, restart_work); > int ret; > + int restart_count; > + > + restart_count = atomic_add_return(1, &ar->restart_count); > + if (restart_count > 1) { > + ath10k_warn(ar, "can not restart, count: %d\n", restart_count); > + atomic_dec(&ar->restart_count); > + return; > + } > > set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags); > > @@ -2231,6 +2239,11 @@ static void ath10k_core_restart(struct work_struct *work) > > mutex_lock(&ar->conf_mutex); > > + if (ar->state != ATH10K_STATE_ON) { > + ath10k_warn(ar, "state is not on: %d\n", ar->state); > + atomic_dec(&ar->restart_count); > + } > + > switch (ar->state) { > case ATH10K_STATE_ON: > ar->state = ATH10K_STATE_RESTARTING; > diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h > index e57b2e7235e3..810c99f2dc0e 100644 > --- a/drivers/net/wireless/ath/ath10k/core.h > +++ b/drivers/net/wireless/ath/ath10k/core.h > @@ -982,6 +982,8 @@ struct ath10k { > /* protected by conf_mutex */ > u8 ps_state_enable; > > + atomic_t restart_count; > + > bool nlo_enabled; > bool p2p; > > diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c > index 3856edba7915..bc1574145e66 100644 > --- a/drivers/net/wireless/ath/ath10k/mac.c > +++ b/drivers/net/wireless/ath/ath10k/mac.c > @@ -7208,6 +7208,7 @@ static void ath10k_reconfig_complete(struct ieee80211_hw *hw, > ath10k_info(ar, "device successfully recovered\n"); > ar->state = ATH10K_STATE_ON; > ieee80211_wake_queues(ar->hw); > + atomic_dec(&ar->restart_count); > } > > mutex_unlock(&ar->conf_mutex); > -- > 2.23.0
On 2020-01-08 20:02, Justin Capella wrote: > I think this might break the "wedged" state. > > Would simply not taking action unless STATE ON avoid the problems with > multiple calls to _restart? ie: > ath10k_core_restart is one part of the recovery process, after ath10k_core_restart, it has other things do in mac80211/ath10k..., it need to make sure all the recovery finished(ath10k_reconfig_complete), then the next recovery can start from ath10k_core_restart. so it can not simply change like below. > diff --git a/drivers/net/wireless/ath/ath10k/core.c > b/drivers/net/wireless/ath/ath10k/core.c > index 5ec16ce19b69..a6c11b2bc97c 100644 > --- a/drivers/net/wireless/ath/ath10k/core.c > +++ b/drivers/net/wireless/ath/ath10k/core.c > @@ -2198,11 +2198,8 @@ static int ath10k_init_hw_params(struct ath10k > *ar) > return 0; > } > > -static void ath10k_core_restart(struct work_struct *work) > +static void inline _ath10k_core_restart(struct ath10k *ar) > { > - struct ath10k *ar = container_of(work, struct ath10k, restart_work); > - int ret; > - > set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags); > > /* Place a barrier to make sure the compiler doesn't reorder > @@ -2232,14 +2229,28 @@ static void ath10k_core_restart(struct > work_struct *work) > */ > cancel_work_sync(&ar->set_coverage_class_work); > > + ath10k_halt(ar); > + ath10k_scan_finish(ar); > + ieee80211_restart_hw(ar->hw); > + > + ret = ath10k_coredump_submit(ar); > + if (ret) > + ath10k_warn(ar, "failed to send firmware crash dump via devcoredump: > %d", ret); > + > + complete(&ar->driver_recovery); > +} > + > +static void ath10k_core_restart(struct work_struct *work) > +{ > + struct ath10k *ar = container_of(work, struct ath10k, restart_work); > + int ret; > + > mutex_lock(&ar->conf_mutex); > > switch (ar->state) { > case ATH10K_STATE_ON: > ar->state = ATH10K_STATE_RESTARTING; > - ath10k_halt(ar); > - ath10k_scan_finish(ar); > - ieee80211_restart_hw(ar->hw); > + _ath10k_core_restart(ar); > break; > case ATH10K_STATE_OFF: > /* this can happen if driver is being unloaded > @@ -2262,13 +2273,6 @@ static void ath10k_core_restart(struct > work_struct *work) > } > > mutex_unlock(&ar->conf_mutex); > - > - ret = ath10k_coredump_submit(ar); > - if (ret) > - ath10k_warn(ar, "failed to send firmware crash dump via devcoredump: > %d", > - ret); > - > - complete(&ar->driver_recovery); > } > > static void ath10k_core_set_coverage_class_work(struct work_struct > *work) > > On Tue, Jan 7, 2020 at 7:20 PM Wen Gong <wgong@codeaurora.org> wrote:
On 2020-01-10 02:29, Wen Gong wrote: > On 2020-01-08 20:02, Justin Capella wrote: >> I think this might break the "wedged" state. >> >> Would simply not taking action unless STATE ON avoid the problems with >> multiple calls to _restart? ie: >> > > ath10k_core_restart is one part of the recovery process, > after ath10k_core_restart, it has other things do in > mac80211/ath10k..., > it need to make sure all the recovery > finished(ath10k_reconfig_complete), > then the next recovery can start from ath10k_core_restart. Here is some log for the 2 recovery run meanwhile and failed. It means if 2 recovery run meanwhile, recovery will easy failed. log steps: 1. simulate crash and then firmware crash and enter ath10k_core_restart with state=ATH10K_STATE_ON 2. it happened "exceeds length, start recovery" and queued the 2nd recovery 3. mac80211 start to recovery ath10k and state changed from ATH10K_STATE_RESTARTING to ATH10K_STATE_RESTARTED 4. enter ath10k_core_restart for 2nd recovery and state changed from ATH10K_STATE_RESTARTED to ATH10K_STATE_WEDGED 5. mac80211 start to drv_add_interface but failed because ath10k_htc_send return -ECOMM for ATH10K_STATE_WEDGED 6. mac80211 start to ieee80211_handle_reconfig_failure because drv_add_interface failed so the recovery failed. kworker/0:1 13360 [000] 2889.292173: ath10k:ath10k_log_err: ath10k_sdio mmc1:0001:1 firmware register dump: kworker/u16:3 11486 [001] 2889.292686: ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_core_restart => state :1 kworker/0:1 13360 [000] 2889.292760: ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 payload length 57005 exceeds max htc length: 4088 kworker/0:1 13360 [000] 2889.292805: ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 exceeds length, start recovery kworker/1:3 14712 [001] 2889.850142: ath10k:ath10k_log_dbg: ath10k_sdio mmc1:0001:1 wmi tlv init kworker/1:3 14712 [001] 2889.866953: ath10k:ath10k_log_dbg: ath10k_sdio mmc1:0001:1 wmi tlv scan prob req oui kworker/u16:3 11486 [001] 2889.888801: ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_core_restart => 2 state :3 kworker/u16:3 11486 [001] 2889.888808: ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 device is wedged, will not restart kworker/1:3 14712 [001] 2889.889220: ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_add_interface => state :4 kworker/1:3 14712 [001] 2889.889229: ath10k:ath10k_log_dbg: ath10k_sdio mmc1:0001:1 mac create vdev 0 map f kworker/1:3 14712 [001] 2889.889240: ath10k:ath10k_log_dbg: ath10k_sdio mmc1:0001:1 mac vdev create 0 (add interface) type 2 subtype 0 bcnmode per-skb kworker/1:3 14712 [001] 2889.889247: ath10k:ath10k_log_dbg: ath10k_sdio mmc1:0001:1 wmi tlv vdev create kworker/1:3 14712 [001] 2889.889252: ath10k:ath10k_wmi_cmd: ath10k_sdio mmc1:0001:1 id 20481 len 28 kworker/1:3 14712 [001] 2889.889260: ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 failed to create WMI vdev 0: -70 ieee80211_reconfig will ieee80211_handle_reconfig_failure because drv_add_interface failed. so the recovery failed. detail log for fail [ 2889.410150] ieee80211 phy0: Hardware restart was requested [ 2889.889281] drv_add_interface ret:-70 [ 2889.889310] ------------[ cut here ]------------ [ 2889.889547] WARNING: CPU: 1 PID: 14712 at /mnt/host/source/src/third_party/kernel/v4.19/net/mac80211/util.c:2072 ieee80211_reconfig+0x189c/0x19a4 [mac80211] [ 2889.889559] Modules linked in: rfcomm uinput cros_ec_rpmsg mtk_dip mtk_vcodec_enc mtk_seninf mtk_vcodec_dec mtk_mdp3 mtk_fd mtk_cam_isp mtk_vcodec_common videobuf2_dma_contig videobuf2_memops v4l2_mem2mem videobuf2_v4l2 videobuf2_common hci_uart btqca bluetooth hid_google_hammer ecdh_generic ov8856 dw9768 mtk_scp mtk_rpmsg ov02a10 rpmsg_core v4l2_fwnode mtk_scp_ipi nf_nat_tftp nf_conntrack_tftp nf_nat_ftp nf_conntrack_ftp esp6 ah6 xfrm6_mode_tunnel xfrm6_mode_transport xfrm4_mode_tunnel xfrm4_mode_transport ip6t_REJECT ip6t_ipv6header ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring cros_ec_light_prox cros_ec_sensors cros_ec_sensors_sync industrialio_triggered_buffer kfifo_buf cros_ec_sensors_core ath10k_sdio ath10k_core ath mac80211 lzo_rle lzo_compress zram cfg80211 ax88179_178a usbnet mii [ 2889.889689] joydev [ 2889.889709] CPU: 1 PID: 14712 Comm: kworker/1:3 Tainted: G W 4.19.95 #139 [ 2889.889720] Hardware name: MediaTek krane sku176 board (DT) [ 2889.889776] Workqueue: events_freezable ieee80211_restart_work [mac80211] [ 2889.889792] pstate: 60000005 (nZCv daif -PAN -UAO) [ 2889.889849] pc : ieee80211_reconfig+0x189c/0x19a4 [mac80211] [ 2889.889908] lr : ieee80211_reconfig+0x189c/0x19a4 [mac80211] [ 2889.889918] sp : ffffff8012743cc0 [ 2889.889927] x29: ffffff8012743d40 x28: ffffffa381e11f40 [ 2889.889941] x27: ffffffa381ad4018 x26: 0000000000000000 [ 2889.889954] x25: 00000000ffffffba x24: 0000000000000004 [ 2889.889968] x23: ffffffe3b52a2210 x22: ffffffe3c3ae12b8 [ 2889.889981] x21: ffffffe3c3ae12b8 x20: ffffffe3b4e80900 [ 2889.889995] x19: ffffffe3c3ae0800 x18: 000000000003b900 [ 2889.890008] x17: 000000000000003c x16: ffffffa3815c1540 [ 2889.890021] x15: 0000000000000006 x14: ffff001000000600 [ 2889.890034] x13: 00000000002ae45c x12: 0000000000000000 [ 2889.890047] x11: 0000000000000001 x10: 0000000000000007 [ 2889.890061] x9 : c6cfad4ea7c13400 x8 : c6cfad4ea7c13400 [ 2889.890074] x7 : 0000000000000000 x6 : ffffffa381f6cd0c [ 2889.890087] x5 : 0000000000000000 x4 : 0000000000000000 [ 2889.890100] x3 : 000000000003e2a1 x2 : fffffffffffff704 [ 2889.890113] x1 : 0000000000000000 x0 : 0000000000000024 [ 2889.890127] Call trace: [ 2889.890185] ieee80211_reconfig+0x189c/0x19a4 [mac80211] [ 2889.890236] ieee80211_restart_work+0xa0/0xd0 [mac80211] [ 2889.890258] process_one_work+0x210/0x418 [ 2889.890273] worker_thread+0x234/0x3dc [ 2889.890288] kthread+0x120/0x130 [ 2889.890303] ret_from_fork+0x10/0x18 [ 2889.890314] ---[ end trace d605ce58e22a44f2 ]--- [ 2889.898956] wlan0: deauthenticating from c4:04:15:3b:e0:38 by local choice (Reason: 3=DEAUTH_LEAVING) [ 2889.898995] ------------[ cut here ]------------ [ 2889.899006] wlan0: Failed check-sdata-in-driver check, flags: 0x0 [ 2889.899289] WARNING: CPU: 1 PID: 14712 at /mnt/host/source/src/third_party/kernel/v4.19/net/mac80211/driver-ops.h:19 ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211] [ 2889.899303] Modules linked in: rfcomm uinput cros_ec_rpmsg mtk_dip mtk_vcodec_enc mtk_seninf mtk_vcodec_dec mtk_mdp3 mtk_fd mtk_cam_isp mtk_vcodec_common videobuf2_dma_contig videobuf2_memops v4l2_mem2mem videobuf2_v4l2 videobuf2_common hci_uart btqca bluetooth hid_google_hammer ecdh_generic ov8856 dw9768 mtk_scp mtk_rpmsg ov02a10 rpmsg_core v4l2_fwnode mtk_scp_ipi nf_nat_tftp nf_conntrack_tftp nf_nat_ftp nf_conntrack_ftp esp6 ah6 xfrm6_mode_tunnel xfrm6_mode_transport xfrm4_mode_tunnel xfrm4_mode_transport ip6t_REJECT ip6t_ipv6header ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring cros_ec_light_prox cros_ec_sensors cros_ec_sensors_sync industrialio_triggered_buffer kfifo_buf cros_ec_sensors_core ath10k_sdio ath10k_core ath mac80211 lzo_rle lzo_compress zram cfg80211 ax88179_178a usbnet mii [ 2889.899438] joydev [ 2889.899475] CPU: 1 PID: 14712 Comm: kworker/1:3 Tainted: G W 4.19.95 #139 [ 2889.899486] Hardware name: MediaTek krane sku176 board (DT) [ 2889.899545] Workqueue: events_freezable ieee80211_restart_work [mac80211] [ 2889.899561] pstate: 60000005 (nZCv daif -PAN -UAO) [ 2889.899612] pc : ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211] [ 2889.899664] lr : ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211] [ 2889.899674] sp : ffffff8012743820 [ 2889.899686] x29: ffffff8012743860 x28: ffffffa381e11f40 [ 2889.899700] x27: ffffffa381ad4018 x26: 0000000000000000 [ 2889.899712] x25: 0000000000000000 x24: 00000000000000c0 [ 2889.899725] x23: 0000000000000003 x22: ffffff80127438fc [ 2889.899738] x21: ffffffe3b4e80900 x20: 0000000000020000 [ 2889.899751] x19: ffffffe3c3ae0800 x18: 000000000003c500 [ 2889.899764] x17: 000000000000003c x16: ffffffa3815c1540 [ 2889.899776] x15: 0000000000000006 x14: ffff001000000600 [ 2889.899789] x13: 00000000002ae46e x12: 0000000000000000 [ 2889.899802] x11: 0000000000000000 x10: 0000000000000000 [ 2889.899814] x9 : c6cfad4ea7c13400 x8 : c6cfad4ea7c13400 [ 2889.899827] x7 : 0000000000000000 x6 : ffffffa381f6cd1e [ 2889.899840] x5 : 0000000000000000 x4 : 0000000000000000 [ 2889.899857] x3 : 000000000003d73b x2 : ffffffe3fff05918 [ 2889.899870] x1 : ffffffe3ffefca08 x0 : 0000000000000036 [ 2889.899883] Call trace: [ 2889.899935] ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211] [ 2889.899996] ieee80211_recalc_ps_vif+0x3c/0x48 [mac80211] [ 2889.900060] ieee80211_set_disassoc+0x84/0x43c [mac80211] [ 2889.900142] ieee80211_mgd_deauth+0x4e0/0x574 [mac80211] [ 2889.900201] ieee80211_deauth+0x24/0x30 [mac80211] [ 2889.900285] cfg80211_mlme_deauth+0x198/0x26c [cfg80211] [ 2889.900337] cfg80211_mlme_down+0x70/0x98 [cfg80211] [ 2889.900393] cfg80211_disconnect+0x214/0x250 [cfg80211] [ 2889.900441] __cfg80211_leave+0xc8/0x14c [cfg80211] [ 2889.900489] cfg80211_netdev_notifier_call+0x41c/0x668 [cfg80211] [ 2889.900515] raw_notifier_call_chain+0x48/0x70 [ 2889.900540] call_netdevice_notifiers_info+0x40/0x74 [ 2889.900556] __dev_close_many+0x6c/0x11c [ 2889.900572] dev_close_many+0x70/0x100 [ 2889.900587] dev_close+0x4c/0x80 [ 2889.900636] cfg80211_shutdown_all_interfaces+0x50/0xcc [cfg80211] [ 2889.900697] ieee80211_handle_reconfig_failure+0x90/0x9c [mac80211] [ 2889.900761] ieee80211_reconfig+0x18dc/0x19a4 [mac80211] [ 2889.900815] ieee80211_restart_work+0xa0/0xd0 [mac80211] [ 2889.900835] process_one_work+0x210/0x418 [ 2889.900855] worker_thread+0x234/0x3dc [ 2889.900871] kthread+0x120/0x130 [ 2889.900890] ret_from_fork+0x10/0x18 [ 2889.900905] ---[ end trace d605ce58e22a44f3 ]---
Ok thanks. I am still trying to familiarize myself with ath10k. A couple things come to mind: firstly recently there was a patch that set stations back to associated during recovery, do you know if the branch you're working against includes that? I will try to figure that out myself but it's at least worth considering. If I remember correctly, the sdio rx involves peeking and checking to see if the frame/packet continues/has more. In the case of firmware recovery/reconfig I think current code enables all sorts of debug pktlog stuff, historically it was maybe used as a hack to check for hw changes... But I mention it because I think what might be happening in some cases is fw crash follows a frame that would continue/extend, which is messing with the content and act_len (maybe?) I also noticed that with aggregation I wind up getting a deauth/dissasoc followed by mlme delete sta/peer/key. I think a retry count could be useful but what do you think of maybe using an event / timeout? On Thu, Jan 16, 2020 at 11:19 PM Wen Gong <wgong@codeaurora.org> wrote: > > On 2020-01-10 02:29, Wen Gong wrote: > > On 2020-01-08 20:02, Justin Capella wrote: > >> I think this might break the "wedged" state. > >> > >> Would simply not taking action unless STATE ON avoid the problems with > >> multiple calls to _restart? ie: > >> > > > > ath10k_core_restart is one part of the recovery process, > > after ath10k_core_restart, it has other things do in > > mac80211/ath10k..., > > it need to make sure all the recovery > > finished(ath10k_reconfig_complete), > > then the next recovery can start from ath10k_core_restart. > > Here is some log for the 2 recovery run meanwhile and failed. > It means if 2 recovery run meanwhile, recovery will easy failed. > > log steps: > 1. simulate crash and then firmware crash and enter ath10k_core_restart > with state=ATH10K_STATE_ON > 2. it happened "exceeds length, start recovery" and queued the 2nd > recovery > 3. mac80211 start to recovery ath10k and state changed from > ATH10K_STATE_RESTARTING to ATH10K_STATE_RESTARTED > 4. enter ath10k_core_restart for 2nd recovery and state changed from > ATH10K_STATE_RESTARTED to ATH10K_STATE_WEDGED > 5. mac80211 start to drv_add_interface but failed because > ath10k_htc_send return -ECOMM for ATH10K_STATE_WEDGED > 6. mac80211 start to ieee80211_handle_reconfig_failure because > drv_add_interface failed so the recovery failed. > > > kworker/0:1 13360 [000] 2889.292173: ath10k:ath10k_log_err: > ath10k_sdio mmc1:0001:1 firmware register dump: > kworker/u16:3 11486 [001] 2889.292686: > ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_core_restart => > state :1 > kworker/0:1 13360 [000] 2889.292760: ath10k:ath10k_log_warn: > ath10k_sdio mmc1:0001:1 payload length 57005 exceeds max htc length: > 4088 > kworker/0:1 13360 [000] 2889.292805: ath10k:ath10k_log_warn: > ath10k_sdio mmc1:0001:1 exceeds length, start recovery > kworker/1:3 14712 [001] 2889.850142: ath10k:ath10k_log_dbg: > ath10k_sdio mmc1:0001:1 wmi tlv init > kworker/1:3 14712 [001] 2889.866953: ath10k:ath10k_log_dbg: > ath10k_sdio mmc1:0001:1 wmi tlv scan prob req oui > kworker/u16:3 11486 [001] 2889.888801: > ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_core_restart => 2 > state :3 > kworker/u16:3 11486 [001] 2889.888808: > ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 device is wedged, will > not restart > kworker/1:3 14712 [001] 2889.889220: ath10k:ath10k_log_warn: > ath10k_sdio mmc1:0001:1 ath10k_add_interface => state :4 > kworker/1:3 14712 [001] 2889.889229: ath10k:ath10k_log_dbg: > ath10k_sdio mmc1:0001:1 mac create vdev 0 map f > kworker/1:3 14712 [001] 2889.889240: ath10k:ath10k_log_dbg: > ath10k_sdio mmc1:0001:1 mac vdev create 0 (add interface) type 2 subtype > 0 bcnmode per-skb > kworker/1:3 14712 [001] 2889.889247: ath10k:ath10k_log_dbg: > ath10k_sdio mmc1:0001:1 wmi tlv vdev create > kworker/1:3 14712 [001] 2889.889252: ath10k:ath10k_wmi_cmd: > ath10k_sdio mmc1:0001:1 id 20481 len 28 > kworker/1:3 14712 [001] 2889.889260: ath10k:ath10k_log_warn: > ath10k_sdio mmc1:0001:1 failed to create WMI vdev 0: -70 > > ieee80211_reconfig will ieee80211_handle_reconfig_failure because > drv_add_interface failed. > so the recovery failed. > > detail log for fail > [ 2889.410150] ieee80211 phy0: Hardware restart was requested > [ 2889.889281] drv_add_interface ret:-70 > [ 2889.889310] ------------[ cut here ]------------ > [ 2889.889547] WARNING: CPU: 1 PID: 14712 at > /mnt/host/source/src/third_party/kernel/v4.19/net/mac80211/util.c:2072 > ieee80211_reconfig+0x189c/0x19a4 [mac80211] > [ 2889.889559] Modules linked in: rfcomm uinput cros_ec_rpmsg mtk_dip > mtk_vcodec_enc mtk_seninf mtk_vcodec_dec mtk_mdp3 mtk_fd mtk_cam_isp > mtk_vcodec_common videobuf2_dma_contig videobuf2_memops v4l2_mem2mem > videobuf2_v4l2 videobuf2_common hci_uart btqca bluetooth > hid_google_hammer ecdh_generic ov8856 dw9768 mtk_scp mtk_rpmsg ov02a10 > rpmsg_core v4l2_fwnode mtk_scp_ipi nf_nat_tftp nf_conntrack_tftp > nf_nat_ftp nf_conntrack_ftp esp6 ah6 xfrm6_mode_tunnel > xfrm6_mode_transport xfrm4_mode_tunnel xfrm4_mode_transport ip6t_REJECT > ip6t_ipv6header ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring > cros_ec_light_prox cros_ec_sensors cros_ec_sensors_sync > industrialio_triggered_buffer kfifo_buf cros_ec_sensors_core ath10k_sdio > ath10k_core ath mac80211 lzo_rle lzo_compress zram cfg80211 ax88179_178a > usbnet mii > [ 2889.889689] joydev > [ 2889.889709] CPU: 1 PID: 14712 Comm: kworker/1:3 Tainted: G W > 4.19.95 #139 > [ 2889.889720] Hardware name: MediaTek krane sku176 board (DT) > [ 2889.889776] Workqueue: events_freezable ieee80211_restart_work > [mac80211] > [ 2889.889792] pstate: 60000005 (nZCv daif -PAN -UAO) > [ 2889.889849] pc : ieee80211_reconfig+0x189c/0x19a4 [mac80211] > [ 2889.889908] lr : ieee80211_reconfig+0x189c/0x19a4 [mac80211] > [ 2889.889918] sp : ffffff8012743cc0 > [ 2889.889927] x29: ffffff8012743d40 x28: ffffffa381e11f40 > [ 2889.889941] x27: ffffffa381ad4018 x26: 0000000000000000 > [ 2889.889954] x25: 00000000ffffffba x24: 0000000000000004 > [ 2889.889968] x23: ffffffe3b52a2210 x22: ffffffe3c3ae12b8 > [ 2889.889981] x21: ffffffe3c3ae12b8 x20: ffffffe3b4e80900 > [ 2889.889995] x19: ffffffe3c3ae0800 x18: 000000000003b900 > [ 2889.890008] x17: 000000000000003c x16: ffffffa3815c1540 > [ 2889.890021] x15: 0000000000000006 x14: ffff001000000600 > [ 2889.890034] x13: 00000000002ae45c x12: 0000000000000000 > [ 2889.890047] x11: 0000000000000001 x10: 0000000000000007 > [ 2889.890061] x9 : c6cfad4ea7c13400 x8 : c6cfad4ea7c13400 > [ 2889.890074] x7 : 0000000000000000 x6 : ffffffa381f6cd0c > [ 2889.890087] x5 : 0000000000000000 x4 : 0000000000000000 > [ 2889.890100] x3 : 000000000003e2a1 x2 : fffffffffffff704 > [ 2889.890113] x1 : 0000000000000000 x0 : 0000000000000024 > [ 2889.890127] Call trace: > [ 2889.890185] ieee80211_reconfig+0x189c/0x19a4 [mac80211] > [ 2889.890236] ieee80211_restart_work+0xa0/0xd0 [mac80211] > [ 2889.890258] process_one_work+0x210/0x418 > [ 2889.890273] worker_thread+0x234/0x3dc > [ 2889.890288] kthread+0x120/0x130 > [ 2889.890303] ret_from_fork+0x10/0x18 > [ 2889.890314] ---[ end trace d605ce58e22a44f2 ]--- > [ 2889.898956] wlan0: deauthenticating from c4:04:15:3b:e0:38 by local > choice (Reason: 3=DEAUTH_LEAVING) > [ 2889.898995] ------------[ cut here ]------------ > [ 2889.899006] wlan0: Failed check-sdata-in-driver check, flags: 0x0 > [ 2889.899289] WARNING: CPU: 1 PID: 14712 at > /mnt/host/source/src/third_party/kernel/v4.19/net/mac80211/driver-ops.h:19 > ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211] > [ 2889.899303] Modules linked in: rfcomm uinput cros_ec_rpmsg mtk_dip > mtk_vcodec_enc mtk_seninf mtk_vcodec_dec mtk_mdp3 mtk_fd mtk_cam_isp > mtk_vcodec_common videobuf2_dma_contig videobuf2_memops v4l2_mem2mem > videobuf2_v4l2 videobuf2_common hci_uart btqca bluetooth > hid_google_hammer ecdh_generic ov8856 dw9768 mtk_scp mtk_rpmsg ov02a10 > rpmsg_core v4l2_fwnode mtk_scp_ipi nf_nat_tftp nf_conntrack_tftp > nf_nat_ftp nf_conntrack_ftp esp6 ah6 xfrm6_mode_tunnel > xfrm6_mode_transport xfrm4_mode_tunnel xfrm4_mode_transport ip6t_REJECT > ip6t_ipv6header ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring > cros_ec_light_prox cros_ec_sensors cros_ec_sensors_sync > industrialio_triggered_buffer kfifo_buf cros_ec_sensors_core ath10k_sdio > ath10k_core ath mac80211 lzo_rle lzo_compress zram cfg80211 ax88179_178a > usbnet mii > [ 2889.899438] joydev > [ 2889.899475] CPU: 1 PID: 14712 Comm: kworker/1:3 Tainted: G W > 4.19.95 #139 > [ 2889.899486] Hardware name: MediaTek krane sku176 board (DT) > [ 2889.899545] Workqueue: events_freezable ieee80211_restart_work > [mac80211] > [ 2889.899561] pstate: 60000005 (nZCv daif -PAN -UAO) > [ 2889.899612] pc : ieee80211_bss_info_change_notify+0x23c/0x24c > [mac80211] > [ 2889.899664] lr : ieee80211_bss_info_change_notify+0x23c/0x24c > [mac80211] > [ 2889.899674] sp : ffffff8012743820 > [ 2889.899686] x29: ffffff8012743860 x28: ffffffa381e11f40 > [ 2889.899700] x27: ffffffa381ad4018 x26: 0000000000000000 > [ 2889.899712] x25: 0000000000000000 x24: 00000000000000c0 > [ 2889.899725] x23: 0000000000000003 x22: ffffff80127438fc > [ 2889.899738] x21: ffffffe3b4e80900 x20: 0000000000020000 > [ 2889.899751] x19: ffffffe3c3ae0800 x18: 000000000003c500 > [ 2889.899764] x17: 000000000000003c x16: ffffffa3815c1540 > [ 2889.899776] x15: 0000000000000006 x14: ffff001000000600 > [ 2889.899789] x13: 00000000002ae46e x12: 0000000000000000 > [ 2889.899802] x11: 0000000000000000 x10: 0000000000000000 > [ 2889.899814] x9 : c6cfad4ea7c13400 x8 : c6cfad4ea7c13400 > [ 2889.899827] x7 : 0000000000000000 x6 : ffffffa381f6cd1e > [ 2889.899840] x5 : 0000000000000000 x4 : 0000000000000000 > [ 2889.899857] x3 : 000000000003d73b x2 : ffffffe3fff05918 > [ 2889.899870] x1 : ffffffe3ffefca08 x0 : 0000000000000036 > [ 2889.899883] Call trace: > [ 2889.899935] ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211] > [ 2889.899996] ieee80211_recalc_ps_vif+0x3c/0x48 [mac80211] > [ 2889.900060] ieee80211_set_disassoc+0x84/0x43c [mac80211] > [ 2889.900142] ieee80211_mgd_deauth+0x4e0/0x574 [mac80211] > [ 2889.900201] ieee80211_deauth+0x24/0x30 [mac80211] > [ 2889.900285] cfg80211_mlme_deauth+0x198/0x26c [cfg80211] > [ 2889.900337] cfg80211_mlme_down+0x70/0x98 [cfg80211] > [ 2889.900393] cfg80211_disconnect+0x214/0x250 [cfg80211] > [ 2889.900441] __cfg80211_leave+0xc8/0x14c [cfg80211] > [ 2889.900489] cfg80211_netdev_notifier_call+0x41c/0x668 [cfg80211] > [ 2889.900515] raw_notifier_call_chain+0x48/0x70 > [ 2889.900540] call_netdevice_notifiers_info+0x40/0x74 > [ 2889.900556] __dev_close_many+0x6c/0x11c > [ 2889.900572] dev_close_many+0x70/0x100 > [ 2889.900587] dev_close+0x4c/0x80 > [ 2889.900636] cfg80211_shutdown_all_interfaces+0x50/0xcc [cfg80211] > [ 2889.900697] ieee80211_handle_reconfig_failure+0x90/0x9c [mac80211] > [ 2889.900761] ieee80211_reconfig+0x18dc/0x19a4 [mac80211] > [ 2889.900815] ieee80211_restart_work+0xa0/0xd0 [mac80211] > [ 2889.900835] process_one_work+0x210/0x418 > [ 2889.900855] worker_thread+0x234/0x3dc > [ 2889.900871] kthread+0x120/0x130 > [ 2889.900890] ret_from_fork+0x10/0x18 > [ 2889.900905] ---[ end trace d605ce58e22a44f3 ]---
On 2020-01-20 17:38, Justin Capella wrote: > Ok thanks. I am still trying to familiarize myself with ath10k. A > couple things come to mind: > > firstly recently there was a patch that set stations back to > associated during recovery, do you know if the branch you're working > against includes that? I will try to figure that out myself but it's > at least worth considering. > can you give the patch link or patch? > If I remember correctly, the sdio rx involves peeking and checking to > see if the frame/packet continues/has more. In the case of firmware > recovery/reconfig I think current code enables all sorts of debug > pktlog stuff, historically it was maybe used as a hack to check for hw > changes... But I mention it because I think what might be happening in > some cases is fw crash follows a frame that would continue/extend, > which is messing with the content and act_len (maybe?) does it have relation with this patch? > > I also noticed that with aggregation I wind up getting a > deauth/dissasoc followed by mlme delete sta/peer/key. > does it have relation with this patch? > I think a retry count could be useful but what do you think of maybe > using an event / timeout? > does it have relation with this patch? > >
Sorry, that response was intended for the start retries. On Mon, Jan 20, 2020 at 5:34 AM Wen Gong <wgong@codeaurora.org> wrote: > > On 2020-01-20 17:38, Justin Capella wrote: > > Ok thanks. I am still trying to familiarize myself with ath10k. A > > couple things come to mind: > > > > firstly recently there was a patch that set stations back to > > associated during recovery, do you know if the branch you're working > > against includes that? I will try to figure that out myself but it's > > at least worth considering. > > > can you give the patch link or patch? > > > If I remember correctly, the sdio rx involves peeking and checking to > > see if the frame/packet continues/has more. In the case of firmware > > recovery/reconfig I think current code enables all sorts of debug > > pktlog stuff, historically it was maybe used as a hack to check for hw > > changes... But I mention it because I think what might be happening in > > some cases is fw crash follows a frame that would continue/extend, > > which is messing with the content and act_len (maybe?) > does it have relation with this patch? > > > > I also noticed that with aggregation I wind up getting a > > deauth/dissasoc followed by mlme delete sta/peer/key. > > > does it have relation with this patch? > > I think a retry count could be useful but what do you think of maybe > > using an event / timeout? > > > does it have relation with this patch? > > > > >
Wen Gong <wgong@codeaurora.org> writes: > When it has more than one restart_work queued meanwhile, the 2nd > restart_work is very esay to break the 1st restart work and lead > recovery fail. > > Add a ref count to allow only one restart work running untill > device successfully recovered. > > Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00029. > > Signed-off-by: Wen Gong <wgong@codeaurora.org> > --- > drivers/net/wireless/ath/ath10k/core.c | 13 +++++++++++++ > drivers/net/wireless/ath/ath10k/core.h | 2 ++ > drivers/net/wireless/ath/ath10k/mac.c | 1 + > 3 files changed, 16 insertions(+) > > diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c > index 91f131b87efc..0e31846e6c89 100644 > --- a/drivers/net/wireless/ath/ath10k/core.c > +++ b/drivers/net/wireless/ath/ath10k/core.c > @@ -2199,6 +2199,14 @@ static void ath10k_core_restart(struct work_struct *work) > { > struct ath10k *ar = container_of(work, struct ath10k, restart_work); > int ret; > + int restart_count; > + > + restart_count = atomic_add_return(1, &ar->restart_count); > + if (restart_count > 1) { > + ath10k_warn(ar, "can not restart, count: %d\n", restart_count); > + atomic_dec(&ar->restart_count); > + return; > + } I have been thinking a different approach for this. I think another option is to have a function like this: ath10k_core_firmware_crashed() { queue_work(ar->workqueue, &ar->restart_work); } In patch 1 we would convert all existing callers to call that function instead of queue_work() directly. In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet which one is better. Now the function would do: ath10k_core_firmware_crashed() { if (test_bit(flag)) return set_bit(flag) queue_work(ar->workqueue, &ar->restart_work); } That way restart_work queue would be called only one time. Though I'm not sure how ATH10K_STATE_WEDGED would behave after this change, it might get broken. Ah, actually I think even this patch breaks the WEDGED state. This firmware restart is tricky, difficult to say what is the best approach. Michal, are you reading? :) Any ideas? And after looking more about this patch I don't see the need for the new ar->restart_count atomic variable. Checking for ATH10K_FLAG_CRASH_FLUSH would do the same thing AFAICS. And related to this, (in a separate patch) I think we should utilise ATH10K_FLAG_CRASH_FLUSH more. For example in ath10k_wmi_cmd_send() to not even try to send a WMI command if the flag is set. Basically all hardware access should be disabled except what is needed to restart the firmware.
On 2020-08-15 01:19, Kalle Valo wrote: > Wen Gong <wgong@codeaurora.org> writes: > ... >> diff --git a/drivers/net/wireless/ath/ath10k/core.c >> b/drivers/net/wireless/ath/ath10k/core.c >> index 91f131b87efc..0e31846e6c89 100644 >> --- a/drivers/net/wireless/ath/ath10k/core.c >> +++ b/drivers/net/wireless/ath/ath10k/core.c >> @@ -2199,6 +2199,14 @@ static void ath10k_core_restart(struct >> work_struct *work) >> { >> struct ath10k *ar = container_of(work, struct ath10k, restart_work); >> int ret; >> + int restart_count; >> + >> + restart_count = atomic_add_return(1, &ar->restart_count); >> + if (restart_count > 1) { >> + ath10k_warn(ar, "can not restart, count: %d\n", restart_count); >> + atomic_dec(&ar->restart_count); >> + return; >> + } > > I have been thinking a different approach for this. I think another > option is to have a function like this: > > ath10k_core_firmware_crashed() > { > queue_work(ar->workqueue, &ar->restart_work); > } > > In patch 1 we would convert all existing callers to call that > function instead of queue_work() directly. > > In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe > should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet > which one is better. Now the function would do: > > ath10k_core_firmware_crashed() > { > if (test_bit(flag)) > return > > set_bit(flag) > queue_work(ar->workqueue, &ar->restart_work); > } > > That way restart_work queue would be called only one time. > This is not muti-thread-safe, for example, if 2 thread entered to the test_bit(flag) meanwhile and both check pass, then it will have 2 restart. atomic_add_return is muti-thread-safe, if 2 thread entered it, only 1 thread can pass the check, another will fail and return. The "payload length exceeds max htc length for sdio" happened many times in a very short time, so I add this check for it. > Though I'm not sure how ATH10K_STATE_WEDGED would behave after this > change, it might get broken. Ah, actually I think even this patch > breaks > the WEDGED state. This firmware restart is tricky, difficult to say > what > is the best approach. Michal, are you reading? :) Any ideas? > > And after looking more about this patch I don't see the need for the > new > ar->restart_count atomic variable. Checking for ATH10K_FLAG_CRASH_FLUSH > would do the same thing AFAICS. > > And related to this, (in a separate patch) I think we should utilise > ATH10K_FLAG_CRASH_FLUSH more. For example in ath10k_wmi_cmd_send() to > not even try to send a WMI command if the flag is set. Basically all > hardware access should be disabled except what is needed to restart the > firmware.
On 2020-08-15 01:19, Kalle Valo wrote: ... > > I have been thinking a different approach for this. I think another > option is to have a function like this: > > ath10k_core_firmware_crashed() > { > queue_work(ar->workqueue, &ar->restart_work); > } > > In patch 1 we would convert all existing callers to call that > function instead of queue_work() directly. > > In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe > should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet > which one is better. Now the function would do: > > ath10k_core_firmware_crashed() > { > if (test_bit(flag)) > return > > set_bit(flag) > queue_work(ar->workqueue, &ar->restart_work); > } > It is better to clear_bit ATH10K_FLAG_CRASH_FLUSH/new flag in ath10k_reconfig_complete instead of ath10k_core_start because ieee80211_reconfig(called by ieee80211_restart_work) of mac80211 do many things and drv_start is 1st thing and drv_reconfig_complete is last thing. > That way restart_work queue would be called only one time. > > Though I'm not sure how ATH10K_STATE_WEDGED would behave after this > change, it might get broken. Ah, actually I think even this patch > breaks > the WEDGED state. This firmware restart is tricky, difficult to say > what > is the best approach. Michal, are you reading? :) Any ideas? > > And after looking more about this patch I don't see the need for the > new > ar->restart_count atomic variable. Checking for ATH10K_FLAG_CRASH_FLUSH > would do the same thing AFAICS. > > And related to this, (in a separate patch) I think we should utilise > ATH10K_FLAG_CRASH_FLUSH more. For example in ath10k_wmi_cmd_send() to > not even try to send a WMI command if the flag is set. Basically all > hardware access should be disabled except what is needed to restart the > firmware.
On 2020-08-15 01:19, Kalle Valo wrote: ... > > I have been thinking a different approach for this. I think another > option is to have a function like this: > > ath10k_core_firmware_crashed() > { > queue_work(ar->workqueue, &ar->restart_work); > } > > In patch 1 we would convert all existing callers to call that > function instead of queue_work() directly. > > In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe > should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet > which one is better. Now the function would do: I thinks we can use test_and_set_bit for atomic operation athough it is same with restart_count. and add a new flag, ATH10K_FLAG_CRASH_FLUSH is used for flush, if still use ATH10K_FLAG_CRASH_FLUSH, it should change clear_bit of it from ath10k_core_start to ath10k_reconfig_complete,because ieee80211_reconfig(called by ieee80211_restart_work) of mac80211 do many things and drv_start is 1st thing and drv_reconfig_complete is last thing, drv_reconfig_complete done means the restart finished. I will send patch v5 with above changes if not other advise. > > ath10k_core_firmware_crashed() > { > if (test_bit(flag)) > return > > set_bit(flag) > queue_work(ar->workqueue, &ar->restart_work); > } > > That way restart_work queue would be called only one time. > > Though I'm not sure how ATH10K_STATE_WEDGED would behave after this > change, it might get broken. Ah, actually I think even this patch > breaks > the WEDGED state. This firmware restart is tricky, difficult to say > what > is the best approach. Michal, are you reading? :) Any ideas? > > And after looking more about this patch I don't see the need for the > new > ar->restart_count atomic variable. Checking for ATH10K_FLAG_CRASH_FLUSH > would do the same thing AFAICS. > > And related to this, (in a separate patch) I think we should utilise > ATH10K_FLAG_CRASH_FLUSH more. For example in ath10k_wmi_cmd_send() to > not even try to send a WMI command if the flag is set. Basically all > hardware access should be disabled except what is needed to restart the > firmware.
On 2020-08-20 17:18, Wen Gong wrote: ... > I thinks we can use test_and_set_bit for atomic operation athough it > is same with restart_count. > and add a new flag, ATH10K_FLAG_CRASH_FLUSH is used for flush, > if still use ATH10K_FLAG_CRASH_FLUSH, it should change clear_bit of it > from > ath10k_core_start to ath10k_reconfig_complete,because > ieee80211_reconfig(called by > ieee80211_restart_work) > of mac80211 do many things and drv_start is 1st thing and > drv_reconfig_complete is last thing, drv_reconfig_complete done means > the restart > finished. > > I will send patch v5 with above changes if not other advise. v5 sent: https://patchwork.kernel.org/patch/11728101/ ...
Wen Gong <wgong@codeaurora.org> writes: > On 2020-08-15 01:19, Kalle Valo wrote: >> Wen Gong <wgong@codeaurora.org> writes: >> > ... >>> diff --git a/drivers/net/wireless/ath/ath10k/core.c >>> b/drivers/net/wireless/ath/ath10k/core.c >>> index 91f131b87efc..0e31846e6c89 100644 >>> --- a/drivers/net/wireless/ath/ath10k/core.c >>> +++ b/drivers/net/wireless/ath/ath10k/core.c >>> @@ -2199,6 +2199,14 @@ static void ath10k_core_restart(struct >>> work_struct *work) >>> { >>> struct ath10k *ar = container_of(work, struct ath10k, restart_work); >>> int ret; >>> + int restart_count; >>> + >>> + restart_count = atomic_add_return(1, &ar->restart_count); >>> + if (restart_count > 1) { >>> + ath10k_warn(ar, "can not restart, count: %d\n", restart_count); >>> + atomic_dec(&ar->restart_count); >>> + return; >>> + } >> >> I have been thinking a different approach for this. I think another >> option is to have a function like this: >> >> ath10k_core_firmware_crashed() >> { >> queue_work(ar->workqueue, &ar->restart_work); >> } >> >> In patch 1 we would convert all existing callers to call that >> function instead of queue_work() directly. >> >> In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe >> should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet >> which one is better. Now the function would do: >> >> ath10k_core_firmware_crashed() >> { >> if (test_bit(flag)) >> return >> >> set_bit(flag) >> queue_work(ar->workqueue, &ar->restart_work); >> } >> >> That way restart_work queue would be called only one time. > > This is not muti-thread-safe, for example, if 2 thread entered to the > test_bit(flag) meanwhile and both check pass, then it will have 2 > restart. Good point, this was racy. And I see that you found test_and_set_bit() already to fix the race. > atomic_add_return is muti-thread-safe, if 2 thread entered it, only 1 > thread can pass the check, another will fail and return. I'm not going to add new state variables unless the justification is REALLY strong and sound. This firmware restart is complicated as is already, there's no reason to complicate it even more.
Wen Gong <wgong@codeaurora.org> writes: > On 2020-08-15 01:19, Kalle Valo wrote: > ... >> >> I have been thinking a different approach for this. I think another >> option is to have a function like this: >> >> ath10k_core_firmware_crashed() >> { >> queue_work(ar->workqueue, &ar->restart_work); >> } >> >> In patch 1 we would convert all existing callers to call that >> function instead of queue_work() directly. >> >> In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe >> should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet >> which one is better. Now the function would do: > > I thinks we can use test_and_set_bit for atomic operation athough it > is same with restart_count. I didn't quite understand what you mean here, but using test_and_set_bit() is the correct methdo. > and add a new flag, ATH10K_FLAG_CRASH_FLUSH is used for flush, if > still use ATH10K_FLAG_CRASH_FLUSH, it should change clear_bit of it > from ath10k_core_start to ath10k_reconfig_complete,because > ieee80211_reconfig(called by ieee80211_restart_work) of mac80211 do > many things and drv_start is 1st thing and drv_reconfig_complete is > last thing, drv_reconfig_complete done means the restart finished. Ok, let's discuss about that in v5. I hope you added the analysis to the commit log.
On 2020-09-07 23:55, Kalle Valo wrote: > Wen Gong <wgong@codeaurora.org> writes: > >> On 2020-08-15 01:19, Kalle Valo wrote: >> ... >>> >>> I have been thinking a different approach for this. I think another >>> option is to have a function like this: >>> >>> ath10k_core_firmware_crashed() >>> { >>> queue_work(ar->workqueue, &ar->restart_work); >>> } >>> >>> In patch 1 we would convert all existing callers to call that >>> function instead of queue_work() directly. >>> >>> In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe >>> should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet >>> which one is better. Now the function would do: >> >> I thinks we can use test_and_set_bit for atomic operation athough it >> is same with restart_count. > > I didn't quite understand what you mean here, but using > test_and_set_bit() is the correct methdo. > >> and add a new flag, ATH10K_FLAG_CRASH_FLUSH is used for flush, if >> still use ATH10K_FLAG_CRASH_FLUSH, it should change clear_bit of it >> from ath10k_core_start to ath10k_reconfig_complete,because >> ieee80211_reconfig(called by ieee80211_restart_work) of mac80211 do >> many things and drv_start is 1st thing and drv_reconfig_complete is >> last thing, drv_reconfig_complete done means the restart finished. > > Ok, let's discuss about that in v5. I hope you added the analysis to > the > commit log. yes, v5 have alread sent in https://patchwork.kernel.org/patch/11728101/ [v5] ath10k: add atomic protection for device recovery
diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 91f131b87efc..0e31846e6c89 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -2199,6 +2199,14 @@ static void ath10k_core_restart(struct work_struct *work) { struct ath10k *ar = container_of(work, struct ath10k, restart_work); int ret; + int restart_count; + + restart_count = atomic_add_return(1, &ar->restart_count); + if (restart_count > 1) { + ath10k_warn(ar, "can not restart, count: %d\n", restart_count); + atomic_dec(&ar->restart_count); + return; + } set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags); @@ -2231,6 +2239,11 @@ static void ath10k_core_restart(struct work_struct *work) mutex_lock(&ar->conf_mutex); + if (ar->state != ATH10K_STATE_ON) { + ath10k_warn(ar, "state is not on: %d\n", ar->state); + atomic_dec(&ar->restart_count); + } + switch (ar->state) { case ATH10K_STATE_ON: ar->state = ATH10K_STATE_RESTARTING; diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index e57b2e7235e3..810c99f2dc0e 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -982,6 +982,8 @@ struct ath10k { /* protected by conf_mutex */ u8 ps_state_enable; + atomic_t restart_count; + bool nlo_enabled; bool p2p; diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 3856edba7915..bc1574145e66 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -7208,6 +7208,7 @@ static void ath10k_reconfig_complete(struct ieee80211_hw *hw, ath10k_info(ar, "device successfully recovered\n"); ar->state = ATH10K_STATE_ON; ieee80211_wake_queues(ar->hw); + atomic_dec(&ar->restart_count); } mutex_unlock(&ar->conf_mutex);
When it has more than one restart_work queued meanwhile, the 2nd restart_work is very esay to break the 1st restart work and lead recovery fail. Add a ref count to allow only one restart work running untill device successfully recovered. Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00029. Signed-off-by: Wen Gong <wgong@codeaurora.org> --- drivers/net/wireless/ath/ath10k/core.c | 13 +++++++++++++ drivers/net/wireless/ath/ath10k/core.h | 2 ++ drivers/net/wireless/ath/ath10k/mac.c | 1 + 3 files changed, 16 insertions(+)