diff mbox series

[v4,1/2] ath10k: add refcount for ath10k_core_restart

Message ID 20200108031957.22308-2-wgong@codeaurora.org (mailing list archive)
State Changes Requested
Delegated to: Kalle Valo
Headers show
Series start recovery process when payload length overflow for sdio | expand

Commit Message

Wen Gong Jan. 8, 2020, 3:19 a.m. UTC
When it has more than one restart_work queued meanwhile, the 2nd
restart_work is very esay to break the 1st restart work and lead
recovery fail.

Add a ref count to allow only one restart work running untill
device successfully recovered.

Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00029.

Signed-off-by: Wen Gong <wgong@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.c | 13 +++++++++++++
 drivers/net/wireless/ath/ath10k/core.h |  2 ++
 drivers/net/wireless/ath/ath10k/mac.c  |  1 +
 3 files changed, 16 insertions(+)

Comments

Justin Capella Jan. 8, 2020, 12:02 p.m. UTC | #1
I think this might break the "wedged" state.

Would simply not taking action unless STATE ON avoid the problems with
multiple calls to _restart? ie:

diff --git a/drivers/net/wireless/ath/ath10k/core.c
b/drivers/net/wireless/ath/ath10k/core.c
index 5ec16ce19b69..a6c11b2bc97c 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -2198,11 +2198,8 @@ static int ath10k_init_hw_params(struct ath10k *ar)
  return 0;
 }

-static void ath10k_core_restart(struct work_struct *work)
+static void inline _ath10k_core_restart(struct ath10k *ar)
 {
- struct ath10k *ar = container_of(work, struct ath10k, restart_work);
- int ret;
-
  set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags);

  /* Place a barrier to make sure the compiler doesn't reorder
@@ -2232,14 +2229,28 @@ static void ath10k_core_restart(struct
work_struct *work)
  */
  cancel_work_sync(&ar->set_coverage_class_work);

+ ath10k_halt(ar);
+ ath10k_scan_finish(ar);
+ ieee80211_restart_hw(ar->hw);
+
+ ret = ath10k_coredump_submit(ar);
+ if (ret)
+ ath10k_warn(ar, "failed to send firmware crash dump via devcoredump:
%d", ret);
+
+ complete(&ar->driver_recovery);
+}
+
+static void ath10k_core_restart(struct work_struct *work)
+{
+ struct ath10k *ar = container_of(work, struct ath10k, restart_work);
+ int ret;
+
  mutex_lock(&ar->conf_mutex);

  switch (ar->state) {
  case ATH10K_STATE_ON:
  ar->state = ATH10K_STATE_RESTARTING;
- ath10k_halt(ar);
- ath10k_scan_finish(ar);
- ieee80211_restart_hw(ar->hw);
+ _ath10k_core_restart(ar);
  break;
  case ATH10K_STATE_OFF:
  /* this can happen if driver is being unloaded
@@ -2262,13 +2273,6 @@ static void ath10k_core_restart(struct work_struct *work)
  }

  mutex_unlock(&ar->conf_mutex);
-
- ret = ath10k_coredump_submit(ar);
- if (ret)
- ath10k_warn(ar, "failed to send firmware crash dump via devcoredump: %d",
-     ret);
-
- complete(&ar->driver_recovery);
 }

 static void ath10k_core_set_coverage_class_work(struct work_struct *work)

On Tue, Jan 7, 2020 at 7:20 PM Wen Gong <wgong@codeaurora.org> wrote:
>
> When it has more than one restart_work queued meanwhile, the 2nd
> restart_work is very esay to break the 1st restart work and lead
> recovery fail.
>
> Add a ref count to allow only one restart work running untill
> device successfully recovered.
>
> Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00029.
>
> Signed-off-by: Wen Gong <wgong@codeaurora.org>
> ---
>  drivers/net/wireless/ath/ath10k/core.c | 13 +++++++++++++
>  drivers/net/wireless/ath/ath10k/core.h |  2 ++
>  drivers/net/wireless/ath/ath10k/mac.c  |  1 +
>  3 files changed, 16 insertions(+)
>
> diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
> index 91f131b87efc..0e31846e6c89 100644
> --- a/drivers/net/wireless/ath/ath10k/core.c
> +++ b/drivers/net/wireless/ath/ath10k/core.c
> @@ -2199,6 +2199,14 @@ static void ath10k_core_restart(struct work_struct *work)
>  {
>         struct ath10k *ar = container_of(work, struct ath10k, restart_work);
>         int ret;
> +       int restart_count;
> +
> +       restart_count = atomic_add_return(1, &ar->restart_count);
> +       if (restart_count > 1) {
> +               ath10k_warn(ar, "can not restart, count: %d\n", restart_count);
> +               atomic_dec(&ar->restart_count);
> +               return;
> +       }
>
>         set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags);
>
> @@ -2231,6 +2239,11 @@ static void ath10k_core_restart(struct work_struct *work)
>
>         mutex_lock(&ar->conf_mutex);
>
> +       if (ar->state != ATH10K_STATE_ON) {
> +               ath10k_warn(ar, "state is not on: %d\n", ar->state);
> +               atomic_dec(&ar->restart_count);
> +       }
> +
>         switch (ar->state) {
>         case ATH10K_STATE_ON:
>                 ar->state = ATH10K_STATE_RESTARTING;
> diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
> index e57b2e7235e3..810c99f2dc0e 100644
> --- a/drivers/net/wireless/ath/ath10k/core.h
> +++ b/drivers/net/wireless/ath/ath10k/core.h
> @@ -982,6 +982,8 @@ struct ath10k {
>         /* protected by conf_mutex */
>         u8 ps_state_enable;
>
> +       atomic_t restart_count;
> +
>         bool nlo_enabled;
>         bool p2p;
>
> diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
> index 3856edba7915..bc1574145e66 100644
> --- a/drivers/net/wireless/ath/ath10k/mac.c
> +++ b/drivers/net/wireless/ath/ath10k/mac.c
> @@ -7208,6 +7208,7 @@ static void ath10k_reconfig_complete(struct ieee80211_hw *hw,
>                 ath10k_info(ar, "device successfully recovered\n");
>                 ar->state = ATH10K_STATE_ON;
>                 ieee80211_wake_queues(ar->hw);
> +               atomic_dec(&ar->restart_count);
>         }
>
>         mutex_unlock(&ar->conf_mutex);
> --
> 2.23.0
Wen Gong Jan. 10, 2020, 10:29 a.m. UTC | #2
On 2020-01-08 20:02, Justin Capella wrote:
> I think this might break the "wedged" state.
> 
> Would simply not taking action unless STATE ON avoid the problems with
> multiple calls to _restart? ie:
> 

ath10k_core_restart is one part of the recovery process,
after ath10k_core_restart, it has other things do in mac80211/ath10k...,
it need to make sure all the recovery 
finished(ath10k_reconfig_complete),
then the next recovery can start from ath10k_core_restart.

so it can not simply change like below.
> diff --git a/drivers/net/wireless/ath/ath10k/core.c
> b/drivers/net/wireless/ath/ath10k/core.c
> index 5ec16ce19b69..a6c11b2bc97c 100644
> --- a/drivers/net/wireless/ath/ath10k/core.c
> +++ b/drivers/net/wireless/ath/ath10k/core.c
> @@ -2198,11 +2198,8 @@ static int ath10k_init_hw_params(struct ath10k 
> *ar)
>   return 0;
>  }
> 
> -static void ath10k_core_restart(struct work_struct *work)
> +static void inline _ath10k_core_restart(struct ath10k *ar)
>  {
> - struct ath10k *ar = container_of(work, struct ath10k, restart_work);
> - int ret;
> -
>   set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags);
> 
>   /* Place a barrier to make sure the compiler doesn't reorder
> @@ -2232,14 +2229,28 @@ static void ath10k_core_restart(struct
> work_struct *work)
>   */
>   cancel_work_sync(&ar->set_coverage_class_work);
> 
> + ath10k_halt(ar);
> + ath10k_scan_finish(ar);
> + ieee80211_restart_hw(ar->hw);
> +
> + ret = ath10k_coredump_submit(ar);
> + if (ret)
> + ath10k_warn(ar, "failed to send firmware crash dump via devcoredump:
> %d", ret);
> +
> + complete(&ar->driver_recovery);
> +}
> +
> +static void ath10k_core_restart(struct work_struct *work)
> +{
> + struct ath10k *ar = container_of(work, struct ath10k, restart_work);
> + int ret;
> +
>   mutex_lock(&ar->conf_mutex);
> 
>   switch (ar->state) {
>   case ATH10K_STATE_ON:
>   ar->state = ATH10K_STATE_RESTARTING;
> - ath10k_halt(ar);
> - ath10k_scan_finish(ar);
> - ieee80211_restart_hw(ar->hw);
> + _ath10k_core_restart(ar);
>   break;
>   case ATH10K_STATE_OFF:
>   /* this can happen if driver is being unloaded
> @@ -2262,13 +2273,6 @@ static void ath10k_core_restart(struct 
> work_struct *work)
>   }
> 
>   mutex_unlock(&ar->conf_mutex);
> -
> - ret = ath10k_coredump_submit(ar);
> - if (ret)
> - ath10k_warn(ar, "failed to send firmware crash dump via devcoredump: 
> %d",
> -     ret);
> -
> - complete(&ar->driver_recovery);
>  }
> 
>  static void ath10k_core_set_coverage_class_work(struct work_struct 
> *work)
> 
> On Tue, Jan 7, 2020 at 7:20 PM Wen Gong <wgong@codeaurora.org> wrote:
Wen Gong Jan. 17, 2020, 7:19 a.m. UTC | #3
On 2020-01-10 02:29, Wen Gong wrote:
> On 2020-01-08 20:02, Justin Capella wrote:
>> I think this might break the "wedged" state.
>> 
>> Would simply not taking action unless STATE ON avoid the problems with
>> multiple calls to _restart? ie:
>> 
> 
> ath10k_core_restart is one part of the recovery process,
> after ath10k_core_restart, it has other things do in 
> mac80211/ath10k...,
> it need to make sure all the recovery 
> finished(ath10k_reconfig_complete),
> then the next recovery can start from ath10k_core_restart.

Here is some log for the 2 recovery run meanwhile and failed.
It means if 2 recovery run meanwhile, recovery will easy failed.

log steps:
1. simulate crash and then firmware crash and enter ath10k_core_restart 
with state=ATH10K_STATE_ON
2. it happened "exceeds length, start recovery" and queued the 2nd 
recovery
3. mac80211 start to recovery ath10k and state changed from 
ATH10K_STATE_RESTARTING to ATH10K_STATE_RESTARTED
4. enter ath10k_core_restart for 2nd recovery and state changed from 
ATH10K_STATE_RESTARTED to ATH10K_STATE_WEDGED
5. mac80211 start to drv_add_interface but failed because 
ath10k_htc_send return -ECOMM for ATH10K_STATE_WEDGED
6. mac80211 start to ieee80211_handle_reconfig_failure because 
drv_add_interface failed so the recovery failed.


kworker/0:1 13360 [000]  2889.292173:            ath10k:ath10k_log_err: 
ath10k_sdio mmc1:0001:1 firmware register dump:
kworker/u16:3 11486 [001]  2889.292686:           
ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_core_restart => 
state :1
kworker/0:1 13360 [000]  2889.292760:           ath10k:ath10k_log_warn: 
ath10k_sdio mmc1:0001:1 payload length 57005 exceeds max htc length: 
4088
kworker/0:1 13360 [000]  2889.292805:           ath10k:ath10k_log_warn: 
ath10k_sdio mmc1:0001:1 exceeds length, start recovery
kworker/1:3 14712 [001]  2889.850142:            ath10k:ath10k_log_dbg: 
ath10k_sdio mmc1:0001:1 wmi tlv init
kworker/1:3 14712 [001]  2889.866953:            ath10k:ath10k_log_dbg: 
ath10k_sdio mmc1:0001:1 wmi tlv scan prob req oui
kworker/u16:3 11486 [001]  2889.888801:           
ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_core_restart => 2 
state :3
kworker/u16:3 11486 [001]  2889.888808:           
ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 device is wedged, will 
not restart
kworker/1:3 14712 [001]  2889.889220:           ath10k:ath10k_log_warn: 
ath10k_sdio mmc1:0001:1 ath10k_add_interface => state :4
kworker/1:3 14712 [001]  2889.889229:            ath10k:ath10k_log_dbg: 
ath10k_sdio mmc1:0001:1 mac create vdev 0 map f
kworker/1:3 14712 [001]  2889.889240:            ath10k:ath10k_log_dbg: 
ath10k_sdio mmc1:0001:1 mac vdev create 0 (add interface) type 2 subtype 
0 bcnmode per-skb
kworker/1:3 14712 [001]  2889.889247:            ath10k:ath10k_log_dbg: 
ath10k_sdio mmc1:0001:1 wmi tlv vdev create
kworker/1:3 14712 [001]  2889.889252:            ath10k:ath10k_wmi_cmd: 
ath10k_sdio mmc1:0001:1 id 20481 len 28
kworker/1:3 14712 [001]  2889.889260:           ath10k:ath10k_log_warn: 
ath10k_sdio mmc1:0001:1 failed to create WMI vdev 0: -70

ieee80211_reconfig will ieee80211_handle_reconfig_failure because 
drv_add_interface failed.
so the recovery failed.

detail log for fail
[ 2889.410150] ieee80211 phy0: Hardware restart was requested
[ 2889.889281] drv_add_interface ret:-70
[ 2889.889310] ------------[ cut here ]------------
[ 2889.889547] WARNING: CPU: 1 PID: 14712 at 
/mnt/host/source/src/third_party/kernel/v4.19/net/mac80211/util.c:2072 
ieee80211_reconfig+0x189c/0x19a4 [mac80211]
[ 2889.889559] Modules linked in: rfcomm uinput cros_ec_rpmsg mtk_dip 
mtk_vcodec_enc mtk_seninf mtk_vcodec_dec mtk_mdp3 mtk_fd mtk_cam_isp 
mtk_vcodec_common videobuf2_dma_contig videobuf2_memops v4l2_mem2mem 
videobuf2_v4l2 videobuf2_common hci_uart btqca bluetooth 
hid_google_hammer ecdh_generic ov8856 dw9768 mtk_scp mtk_rpmsg ov02a10 
rpmsg_core v4l2_fwnode mtk_scp_ipi nf_nat_tftp nf_conntrack_tftp 
nf_nat_ftp nf_conntrack_ftp esp6 ah6 xfrm6_mode_tunnel 
xfrm6_mode_transport xfrm4_mode_tunnel xfrm4_mode_transport ip6t_REJECT 
ip6t_ipv6header ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring 
cros_ec_light_prox cros_ec_sensors cros_ec_sensors_sync 
industrialio_triggered_buffer kfifo_buf cros_ec_sensors_core ath10k_sdio 
ath10k_core ath mac80211 lzo_rle lzo_compress zram cfg80211 ax88179_178a 
usbnet mii
[ 2889.889689]  joydev
[ 2889.889709] CPU: 1 PID: 14712 Comm: kworker/1:3 Tainted: G        W   
       4.19.95 #139
[ 2889.889720] Hardware name: MediaTek krane sku176 board (DT)
[ 2889.889776] Workqueue: events_freezable ieee80211_restart_work 
[mac80211]
[ 2889.889792] pstate: 60000005 (nZCv daif -PAN -UAO)
[ 2889.889849] pc : ieee80211_reconfig+0x189c/0x19a4 [mac80211]
[ 2889.889908] lr : ieee80211_reconfig+0x189c/0x19a4 [mac80211]
[ 2889.889918] sp : ffffff8012743cc0
[ 2889.889927] x29: ffffff8012743d40 x28: ffffffa381e11f40
[ 2889.889941] x27: ffffffa381ad4018 x26: 0000000000000000
[ 2889.889954] x25: 00000000ffffffba x24: 0000000000000004
[ 2889.889968] x23: ffffffe3b52a2210 x22: ffffffe3c3ae12b8
[ 2889.889981] x21: ffffffe3c3ae12b8 x20: ffffffe3b4e80900
[ 2889.889995] x19: ffffffe3c3ae0800 x18: 000000000003b900
[ 2889.890008] x17: 000000000000003c x16: ffffffa3815c1540
[ 2889.890021] x15: 0000000000000006 x14: ffff001000000600
[ 2889.890034] x13: 00000000002ae45c x12: 0000000000000000
[ 2889.890047] x11: 0000000000000001 x10: 0000000000000007
[ 2889.890061] x9 : c6cfad4ea7c13400 x8 : c6cfad4ea7c13400
[ 2889.890074] x7 : 0000000000000000 x6 : ffffffa381f6cd0c
[ 2889.890087] x5 : 0000000000000000 x4 : 0000000000000000
[ 2889.890100] x3 : 000000000003e2a1 x2 : fffffffffffff704
[ 2889.890113] x1 : 0000000000000000 x0 : 0000000000000024
[ 2889.890127] Call trace:
[ 2889.890185]  ieee80211_reconfig+0x189c/0x19a4 [mac80211]
[ 2889.890236]  ieee80211_restart_work+0xa0/0xd0 [mac80211]
[ 2889.890258]  process_one_work+0x210/0x418
[ 2889.890273]  worker_thread+0x234/0x3dc
[ 2889.890288]  kthread+0x120/0x130
[ 2889.890303]  ret_from_fork+0x10/0x18
[ 2889.890314] ---[ end trace d605ce58e22a44f2 ]---
[ 2889.898956] wlan0: deauthenticating from c4:04:15:3b:e0:38 by local 
choice (Reason: 3=DEAUTH_LEAVING)
[ 2889.898995] ------------[ cut here ]------------
[ 2889.899006] wlan0:  Failed check-sdata-in-driver check, flags: 0x0
[ 2889.899289] WARNING: CPU: 1 PID: 14712 at 
/mnt/host/source/src/third_party/kernel/v4.19/net/mac80211/driver-ops.h:19 
ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211]
[ 2889.899303] Modules linked in: rfcomm uinput cros_ec_rpmsg mtk_dip 
mtk_vcodec_enc mtk_seninf mtk_vcodec_dec mtk_mdp3 mtk_fd mtk_cam_isp 
mtk_vcodec_common videobuf2_dma_contig videobuf2_memops v4l2_mem2mem 
videobuf2_v4l2 videobuf2_common hci_uart btqca bluetooth 
hid_google_hammer ecdh_generic ov8856 dw9768 mtk_scp mtk_rpmsg ov02a10 
rpmsg_core v4l2_fwnode mtk_scp_ipi nf_nat_tftp nf_conntrack_tftp 
nf_nat_ftp nf_conntrack_ftp esp6 ah6 xfrm6_mode_tunnel 
xfrm6_mode_transport xfrm4_mode_tunnel xfrm4_mode_transport ip6t_REJECT 
ip6t_ipv6header ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring 
cros_ec_light_prox cros_ec_sensors cros_ec_sensors_sync 
industrialio_triggered_buffer kfifo_buf cros_ec_sensors_core ath10k_sdio 
ath10k_core ath mac80211 lzo_rle lzo_compress zram cfg80211 ax88179_178a 
usbnet mii
[ 2889.899438]  joydev
[ 2889.899475] CPU: 1 PID: 14712 Comm: kworker/1:3 Tainted: G        W   
       4.19.95 #139
[ 2889.899486] Hardware name: MediaTek krane sku176 board (DT)
[ 2889.899545] Workqueue: events_freezable ieee80211_restart_work 
[mac80211]
[ 2889.899561] pstate: 60000005 (nZCv daif -PAN -UAO)
[ 2889.899612] pc : ieee80211_bss_info_change_notify+0x23c/0x24c 
[mac80211]
[ 2889.899664] lr : ieee80211_bss_info_change_notify+0x23c/0x24c 
[mac80211]
[ 2889.899674] sp : ffffff8012743820
[ 2889.899686] x29: ffffff8012743860 x28: ffffffa381e11f40
[ 2889.899700] x27: ffffffa381ad4018 x26: 0000000000000000
[ 2889.899712] x25: 0000000000000000 x24: 00000000000000c0
[ 2889.899725] x23: 0000000000000003 x22: ffffff80127438fc
[ 2889.899738] x21: ffffffe3b4e80900 x20: 0000000000020000
[ 2889.899751] x19: ffffffe3c3ae0800 x18: 000000000003c500
[ 2889.899764] x17: 000000000000003c x16: ffffffa3815c1540
[ 2889.899776] x15: 0000000000000006 x14: ffff001000000600
[ 2889.899789] x13: 00000000002ae46e x12: 0000000000000000
[ 2889.899802] x11: 0000000000000000 x10: 0000000000000000
[ 2889.899814] x9 : c6cfad4ea7c13400 x8 : c6cfad4ea7c13400
[ 2889.899827] x7 : 0000000000000000 x6 : ffffffa381f6cd1e
[ 2889.899840] x5 : 0000000000000000 x4 : 0000000000000000
[ 2889.899857] x3 : 000000000003d73b x2 : ffffffe3fff05918
[ 2889.899870] x1 : ffffffe3ffefca08 x0 : 0000000000000036
[ 2889.899883] Call trace:
[ 2889.899935]  ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211]
[ 2889.899996]  ieee80211_recalc_ps_vif+0x3c/0x48 [mac80211]
[ 2889.900060]  ieee80211_set_disassoc+0x84/0x43c [mac80211]
[ 2889.900142]  ieee80211_mgd_deauth+0x4e0/0x574 [mac80211]
[ 2889.900201]  ieee80211_deauth+0x24/0x30 [mac80211]
[ 2889.900285]  cfg80211_mlme_deauth+0x198/0x26c [cfg80211]
[ 2889.900337]  cfg80211_mlme_down+0x70/0x98 [cfg80211]
[ 2889.900393]  cfg80211_disconnect+0x214/0x250 [cfg80211]
[ 2889.900441]  __cfg80211_leave+0xc8/0x14c [cfg80211]
[ 2889.900489]  cfg80211_netdev_notifier_call+0x41c/0x668 [cfg80211]
[ 2889.900515]  raw_notifier_call_chain+0x48/0x70
[ 2889.900540]  call_netdevice_notifiers_info+0x40/0x74
[ 2889.900556]  __dev_close_many+0x6c/0x11c
[ 2889.900572]  dev_close_many+0x70/0x100
[ 2889.900587]  dev_close+0x4c/0x80
[ 2889.900636]  cfg80211_shutdown_all_interfaces+0x50/0xcc [cfg80211]
[ 2889.900697]  ieee80211_handle_reconfig_failure+0x90/0x9c [mac80211]
[ 2889.900761]  ieee80211_reconfig+0x18dc/0x19a4 [mac80211]
[ 2889.900815]  ieee80211_restart_work+0xa0/0xd0 [mac80211]
[ 2889.900835]  process_one_work+0x210/0x418
[ 2889.900855]  worker_thread+0x234/0x3dc
[ 2889.900871]  kthread+0x120/0x130
[ 2889.900890]  ret_from_fork+0x10/0x18
[ 2889.900905] ---[ end trace d605ce58e22a44f3 ]---
Justin Capella Jan. 20, 2020, 9:38 a.m. UTC | #4
Ok thanks. I am still trying to familiarize myself with ath10k. A
couple things come to mind:

firstly recently there was a patch that set stations back to
associated during recovery, do you know if the branch you're working
against includes that? I will try to figure that out myself but it's
at least worth considering.

If I remember correctly, the sdio rx involves peeking and checking to
see if the frame/packet continues/has more. In the case of firmware
recovery/reconfig I think current code enables all sorts of debug
pktlog stuff, historically it was maybe used as a hack to check for hw
changes... But I mention it because I think what might be happening in
some cases is fw crash follows a frame that would continue/extend,
which is messing with the content and act_len (maybe?)

I also noticed that with aggregation I wind up getting a
deauth/dissasoc followed by mlme delete sta/peer/key.

I think a retry count could be useful but what do you think of maybe
using an event / timeout?



On Thu, Jan 16, 2020 at 11:19 PM Wen Gong <wgong@codeaurora.org> wrote:
>
> On 2020-01-10 02:29, Wen Gong wrote:
> > On 2020-01-08 20:02, Justin Capella wrote:
> >> I think this might break the "wedged" state.
> >>
> >> Would simply not taking action unless STATE ON avoid the problems with
> >> multiple calls to _restart? ie:
> >>
> >
> > ath10k_core_restart is one part of the recovery process,
> > after ath10k_core_restart, it has other things do in
> > mac80211/ath10k...,
> > it need to make sure all the recovery
> > finished(ath10k_reconfig_complete),
> > then the next recovery can start from ath10k_core_restart.
>
> Here is some log for the 2 recovery run meanwhile and failed.
> It means if 2 recovery run meanwhile, recovery will easy failed.
>
> log steps:
> 1. simulate crash and then firmware crash and enter ath10k_core_restart
> with state=ATH10K_STATE_ON
> 2. it happened "exceeds length, start recovery" and queued the 2nd
> recovery
> 3. mac80211 start to recovery ath10k and state changed from
> ATH10K_STATE_RESTARTING to ATH10K_STATE_RESTARTED
> 4. enter ath10k_core_restart for 2nd recovery and state changed from
> ATH10K_STATE_RESTARTED to ATH10K_STATE_WEDGED
> 5. mac80211 start to drv_add_interface but failed because
> ath10k_htc_send return -ECOMM for ATH10K_STATE_WEDGED
> 6. mac80211 start to ieee80211_handle_reconfig_failure because
> drv_add_interface failed so the recovery failed.
>
>
> kworker/0:1 13360 [000]  2889.292173:            ath10k:ath10k_log_err:
> ath10k_sdio mmc1:0001:1 firmware register dump:
> kworker/u16:3 11486 [001]  2889.292686:
> ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_core_restart =>
> state :1
> kworker/0:1 13360 [000]  2889.292760:           ath10k:ath10k_log_warn:
> ath10k_sdio mmc1:0001:1 payload length 57005 exceeds max htc length:
> 4088
> kworker/0:1 13360 [000]  2889.292805:           ath10k:ath10k_log_warn:
> ath10k_sdio mmc1:0001:1 exceeds length, start recovery
> kworker/1:3 14712 [001]  2889.850142:            ath10k:ath10k_log_dbg:
> ath10k_sdio mmc1:0001:1 wmi tlv init
> kworker/1:3 14712 [001]  2889.866953:            ath10k:ath10k_log_dbg:
> ath10k_sdio mmc1:0001:1 wmi tlv scan prob req oui
> kworker/u16:3 11486 [001]  2889.888801:
> ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 ath10k_core_restart => 2
> state :3
> kworker/u16:3 11486 [001]  2889.888808:
> ath10k:ath10k_log_warn: ath10k_sdio mmc1:0001:1 device is wedged, will
> not restart
> kworker/1:3 14712 [001]  2889.889220:           ath10k:ath10k_log_warn:
> ath10k_sdio mmc1:0001:1 ath10k_add_interface => state :4
> kworker/1:3 14712 [001]  2889.889229:            ath10k:ath10k_log_dbg:
> ath10k_sdio mmc1:0001:1 mac create vdev 0 map f
> kworker/1:3 14712 [001]  2889.889240:            ath10k:ath10k_log_dbg:
> ath10k_sdio mmc1:0001:1 mac vdev create 0 (add interface) type 2 subtype
> 0 bcnmode per-skb
> kworker/1:3 14712 [001]  2889.889247:            ath10k:ath10k_log_dbg:
> ath10k_sdio mmc1:0001:1 wmi tlv vdev create
> kworker/1:3 14712 [001]  2889.889252:            ath10k:ath10k_wmi_cmd:
> ath10k_sdio mmc1:0001:1 id 20481 len 28
> kworker/1:3 14712 [001]  2889.889260:           ath10k:ath10k_log_warn:
> ath10k_sdio mmc1:0001:1 failed to create WMI vdev 0: -70
>
> ieee80211_reconfig will ieee80211_handle_reconfig_failure because
> drv_add_interface failed.
> so the recovery failed.
>
> detail log for fail
> [ 2889.410150] ieee80211 phy0: Hardware restart was requested
> [ 2889.889281] drv_add_interface ret:-70
> [ 2889.889310] ------------[ cut here ]------------
> [ 2889.889547] WARNING: CPU: 1 PID: 14712 at
> /mnt/host/source/src/third_party/kernel/v4.19/net/mac80211/util.c:2072
> ieee80211_reconfig+0x189c/0x19a4 [mac80211]
> [ 2889.889559] Modules linked in: rfcomm uinput cros_ec_rpmsg mtk_dip
> mtk_vcodec_enc mtk_seninf mtk_vcodec_dec mtk_mdp3 mtk_fd mtk_cam_isp
> mtk_vcodec_common videobuf2_dma_contig videobuf2_memops v4l2_mem2mem
> videobuf2_v4l2 videobuf2_common hci_uart btqca bluetooth
> hid_google_hammer ecdh_generic ov8856 dw9768 mtk_scp mtk_rpmsg ov02a10
> rpmsg_core v4l2_fwnode mtk_scp_ipi nf_nat_tftp nf_conntrack_tftp
> nf_nat_ftp nf_conntrack_ftp esp6 ah6 xfrm6_mode_tunnel
> xfrm6_mode_transport xfrm4_mode_tunnel xfrm4_mode_transport ip6t_REJECT
> ip6t_ipv6header ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring
> cros_ec_light_prox cros_ec_sensors cros_ec_sensors_sync
> industrialio_triggered_buffer kfifo_buf cros_ec_sensors_core ath10k_sdio
> ath10k_core ath mac80211 lzo_rle lzo_compress zram cfg80211 ax88179_178a
> usbnet mii
> [ 2889.889689]  joydev
> [ 2889.889709] CPU: 1 PID: 14712 Comm: kworker/1:3 Tainted: G        W
>        4.19.95 #139
> [ 2889.889720] Hardware name: MediaTek krane sku176 board (DT)
> [ 2889.889776] Workqueue: events_freezable ieee80211_restart_work
> [mac80211]
> [ 2889.889792] pstate: 60000005 (nZCv daif -PAN -UAO)
> [ 2889.889849] pc : ieee80211_reconfig+0x189c/0x19a4 [mac80211]
> [ 2889.889908] lr : ieee80211_reconfig+0x189c/0x19a4 [mac80211]
> [ 2889.889918] sp : ffffff8012743cc0
> [ 2889.889927] x29: ffffff8012743d40 x28: ffffffa381e11f40
> [ 2889.889941] x27: ffffffa381ad4018 x26: 0000000000000000
> [ 2889.889954] x25: 00000000ffffffba x24: 0000000000000004
> [ 2889.889968] x23: ffffffe3b52a2210 x22: ffffffe3c3ae12b8
> [ 2889.889981] x21: ffffffe3c3ae12b8 x20: ffffffe3b4e80900
> [ 2889.889995] x19: ffffffe3c3ae0800 x18: 000000000003b900
> [ 2889.890008] x17: 000000000000003c x16: ffffffa3815c1540
> [ 2889.890021] x15: 0000000000000006 x14: ffff001000000600
> [ 2889.890034] x13: 00000000002ae45c x12: 0000000000000000
> [ 2889.890047] x11: 0000000000000001 x10: 0000000000000007
> [ 2889.890061] x9 : c6cfad4ea7c13400 x8 : c6cfad4ea7c13400
> [ 2889.890074] x7 : 0000000000000000 x6 : ffffffa381f6cd0c
> [ 2889.890087] x5 : 0000000000000000 x4 : 0000000000000000
> [ 2889.890100] x3 : 000000000003e2a1 x2 : fffffffffffff704
> [ 2889.890113] x1 : 0000000000000000 x0 : 0000000000000024
> [ 2889.890127] Call trace:
> [ 2889.890185]  ieee80211_reconfig+0x189c/0x19a4 [mac80211]
> [ 2889.890236]  ieee80211_restart_work+0xa0/0xd0 [mac80211]
> [ 2889.890258]  process_one_work+0x210/0x418
> [ 2889.890273]  worker_thread+0x234/0x3dc
> [ 2889.890288]  kthread+0x120/0x130
> [ 2889.890303]  ret_from_fork+0x10/0x18
> [ 2889.890314] ---[ end trace d605ce58e22a44f2 ]---
> [ 2889.898956] wlan0: deauthenticating from c4:04:15:3b:e0:38 by local
> choice (Reason: 3=DEAUTH_LEAVING)
> [ 2889.898995] ------------[ cut here ]------------
> [ 2889.899006] wlan0:  Failed check-sdata-in-driver check, flags: 0x0
> [ 2889.899289] WARNING: CPU: 1 PID: 14712 at
> /mnt/host/source/src/third_party/kernel/v4.19/net/mac80211/driver-ops.h:19
> ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211]
> [ 2889.899303] Modules linked in: rfcomm uinput cros_ec_rpmsg mtk_dip
> mtk_vcodec_enc mtk_seninf mtk_vcodec_dec mtk_mdp3 mtk_fd mtk_cam_isp
> mtk_vcodec_common videobuf2_dma_contig videobuf2_memops v4l2_mem2mem
> videobuf2_v4l2 videobuf2_common hci_uart btqca bluetooth
> hid_google_hammer ecdh_generic ov8856 dw9768 mtk_scp mtk_rpmsg ov02a10
> rpmsg_core v4l2_fwnode mtk_scp_ipi nf_nat_tftp nf_conntrack_tftp
> nf_nat_ftp nf_conntrack_ftp esp6 ah6 xfrm6_mode_tunnel
> xfrm6_mode_transport xfrm4_mode_tunnel xfrm4_mode_transport ip6t_REJECT
> ip6t_ipv6header ipt_MASQUERADE fuse iio_trig_sysfs cros_ec_sensors_ring
> cros_ec_light_prox cros_ec_sensors cros_ec_sensors_sync
> industrialio_triggered_buffer kfifo_buf cros_ec_sensors_core ath10k_sdio
> ath10k_core ath mac80211 lzo_rle lzo_compress zram cfg80211 ax88179_178a
> usbnet mii
> [ 2889.899438]  joydev
> [ 2889.899475] CPU: 1 PID: 14712 Comm: kworker/1:3 Tainted: G        W
>        4.19.95 #139
> [ 2889.899486] Hardware name: MediaTek krane sku176 board (DT)
> [ 2889.899545] Workqueue: events_freezable ieee80211_restart_work
> [mac80211]
> [ 2889.899561] pstate: 60000005 (nZCv daif -PAN -UAO)
> [ 2889.899612] pc : ieee80211_bss_info_change_notify+0x23c/0x24c
> [mac80211]
> [ 2889.899664] lr : ieee80211_bss_info_change_notify+0x23c/0x24c
> [mac80211]
> [ 2889.899674] sp : ffffff8012743820
> [ 2889.899686] x29: ffffff8012743860 x28: ffffffa381e11f40
> [ 2889.899700] x27: ffffffa381ad4018 x26: 0000000000000000
> [ 2889.899712] x25: 0000000000000000 x24: 00000000000000c0
> [ 2889.899725] x23: 0000000000000003 x22: ffffff80127438fc
> [ 2889.899738] x21: ffffffe3b4e80900 x20: 0000000000020000
> [ 2889.899751] x19: ffffffe3c3ae0800 x18: 000000000003c500
> [ 2889.899764] x17: 000000000000003c x16: ffffffa3815c1540
> [ 2889.899776] x15: 0000000000000006 x14: ffff001000000600
> [ 2889.899789] x13: 00000000002ae46e x12: 0000000000000000
> [ 2889.899802] x11: 0000000000000000 x10: 0000000000000000
> [ 2889.899814] x9 : c6cfad4ea7c13400 x8 : c6cfad4ea7c13400
> [ 2889.899827] x7 : 0000000000000000 x6 : ffffffa381f6cd1e
> [ 2889.899840] x5 : 0000000000000000 x4 : 0000000000000000
> [ 2889.899857] x3 : 000000000003d73b x2 : ffffffe3fff05918
> [ 2889.899870] x1 : ffffffe3ffefca08 x0 : 0000000000000036
> [ 2889.899883] Call trace:
> [ 2889.899935]  ieee80211_bss_info_change_notify+0x23c/0x24c [mac80211]
> [ 2889.899996]  ieee80211_recalc_ps_vif+0x3c/0x48 [mac80211]
> [ 2889.900060]  ieee80211_set_disassoc+0x84/0x43c [mac80211]
> [ 2889.900142]  ieee80211_mgd_deauth+0x4e0/0x574 [mac80211]
> [ 2889.900201]  ieee80211_deauth+0x24/0x30 [mac80211]
> [ 2889.900285]  cfg80211_mlme_deauth+0x198/0x26c [cfg80211]
> [ 2889.900337]  cfg80211_mlme_down+0x70/0x98 [cfg80211]
> [ 2889.900393]  cfg80211_disconnect+0x214/0x250 [cfg80211]
> [ 2889.900441]  __cfg80211_leave+0xc8/0x14c [cfg80211]
> [ 2889.900489]  cfg80211_netdev_notifier_call+0x41c/0x668 [cfg80211]
> [ 2889.900515]  raw_notifier_call_chain+0x48/0x70
> [ 2889.900540]  call_netdevice_notifiers_info+0x40/0x74
> [ 2889.900556]  __dev_close_many+0x6c/0x11c
> [ 2889.900572]  dev_close_many+0x70/0x100
> [ 2889.900587]  dev_close+0x4c/0x80
> [ 2889.900636]  cfg80211_shutdown_all_interfaces+0x50/0xcc [cfg80211]
> [ 2889.900697]  ieee80211_handle_reconfig_failure+0x90/0x9c [mac80211]
> [ 2889.900761]  ieee80211_reconfig+0x18dc/0x19a4 [mac80211]
> [ 2889.900815]  ieee80211_restart_work+0xa0/0xd0 [mac80211]
> [ 2889.900835]  process_one_work+0x210/0x418
> [ 2889.900855]  worker_thread+0x234/0x3dc
> [ 2889.900871]  kthread+0x120/0x130
> [ 2889.900890]  ret_from_fork+0x10/0x18
> [ 2889.900905] ---[ end trace d605ce58e22a44f3 ]---
Wen Gong Jan. 20, 2020, 1:34 p.m. UTC | #5
On 2020-01-20 17:38, Justin Capella wrote:
> Ok thanks. I am still trying to familiarize myself with ath10k. A
> couple things come to mind:
> 
> firstly recently there was a patch that set stations back to
> associated during recovery, do you know if the branch you're working
> against includes that? I will try to figure that out myself but it's
> at least worth considering.
> 
can you give the patch link or patch?

> If I remember correctly, the sdio rx involves peeking and checking to
> see if the frame/packet continues/has more. In the case of firmware
> recovery/reconfig I think current code enables all sorts of debug
> pktlog stuff, historically it was maybe used as a hack to check for hw
> changes... But I mention it because I think what might be happening in
> some cases is fw crash follows a frame that would continue/extend,
> which is messing with the content and act_len (maybe?)
does it have relation with this patch?
> 
> I also noticed that with aggregation I wind up getting a
> deauth/dissasoc followed by mlme delete sta/peer/key.
> 
does it have relation with this patch?
> I think a retry count could be useful but what do you think of maybe
> using an event / timeout?
> 
does it have relation with this patch?

> 
>
Justin Capella Jan. 20, 2020, 3:37 p.m. UTC | #6
Sorry, that response was intended for the start retries.

On Mon, Jan 20, 2020 at 5:34 AM Wen Gong <wgong@codeaurora.org> wrote:
>
> On 2020-01-20 17:38, Justin Capella wrote:
> > Ok thanks. I am still trying to familiarize myself with ath10k. A
> > couple things come to mind:
> >
> > firstly recently there was a patch that set stations back to
> > associated during recovery, do you know if the branch you're working
> > against includes that? I will try to figure that out myself but it's
> > at least worth considering.
> >
> can you give the patch link or patch?
>
> > If I remember correctly, the sdio rx involves peeking and checking to
> > see if the frame/packet continues/has more. In the case of firmware
> > recovery/reconfig I think current code enables all sorts of debug
> > pktlog stuff, historically it was maybe used as a hack to check for hw
> > changes... But I mention it because I think what might be happening in
> > some cases is fw crash follows a frame that would continue/extend,
> > which is messing with the content and act_len (maybe?)
> does it have relation with this patch?
> >
> > I also noticed that with aggregation I wind up getting a
> > deauth/dissasoc followed by mlme delete sta/peer/key.
> >
> does it have relation with this patch?
> > I think a retry count could be useful but what do you think of maybe
> > using an event / timeout?
> >
> does it have relation with this patch?
>
> >
> >
Kalle Valo Aug. 14, 2020, 5:19 p.m. UTC | #7
Wen Gong <wgong@codeaurora.org> writes:

> When it has more than one restart_work queued meanwhile, the 2nd
> restart_work is very esay to break the 1st restart work and lead
> recovery fail.
>
> Add a ref count to allow only one restart work running untill
> device successfully recovered.
>
> Tested with QCA6174 SDIO with firmware WLAN.RMH.4.4.1-00029.
>
> Signed-off-by: Wen Gong <wgong@codeaurora.org>
> ---
>  drivers/net/wireless/ath/ath10k/core.c | 13 +++++++++++++
>  drivers/net/wireless/ath/ath10k/core.h |  2 ++
>  drivers/net/wireless/ath/ath10k/mac.c  |  1 +
>  3 files changed, 16 insertions(+)
>
> diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
> index 91f131b87efc..0e31846e6c89 100644
> --- a/drivers/net/wireless/ath/ath10k/core.c
> +++ b/drivers/net/wireless/ath/ath10k/core.c
> @@ -2199,6 +2199,14 @@ static void ath10k_core_restart(struct work_struct *work)
>  {
>  	struct ath10k *ar = container_of(work, struct ath10k, restart_work);
>  	int ret;
> +	int restart_count;
> +
> +	restart_count = atomic_add_return(1, &ar->restart_count);
> +	if (restart_count > 1) {
> +		ath10k_warn(ar, "can not restart, count: %d\n", restart_count);
> +		atomic_dec(&ar->restart_count);
> +		return;
> +	}

I have been thinking a different approach for this. I think another
option is to have a function like this:

ath10k_core_firmware_crashed()
{
        queue_work(ar->workqueue, &ar->restart_work);
}

In patch 1 we would convert all existing callers to call that
function instead of queue_work() directly.

In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe
should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet
which one is better. Now the function would do:

ath10k_core_firmware_crashed()
{
        if (test_bit(flag))
                return

        set_bit(flag)                                
	queue_work(ar->workqueue, &ar->restart_work);
}

That way restart_work queue would be called only one time.

Though I'm not sure how ATH10K_STATE_WEDGED would behave after this
change, it might get broken. Ah, actually I think even this patch breaks
the WEDGED state. This firmware restart is tricky, difficult to say what
is the best approach. Michal, are you reading? :) Any ideas?

And after looking more about this patch I don't see the need for the new
ar->restart_count atomic variable. Checking for ATH10K_FLAG_CRASH_FLUSH
would do the same thing AFAICS.

And related to this, (in a separate patch) I think we should utilise
ATH10K_FLAG_CRASH_FLUSH more. For example in ath10k_wmi_cmd_send() to
not even try to send a WMI command if the flag is set. Basically all
hardware access should be disabled except what is needed to restart the
firmware.
Wen Gong Aug. 18, 2020, 8:39 a.m. UTC | #8
On 2020-08-15 01:19, Kalle Valo wrote:
> Wen Gong <wgong@codeaurora.org> writes:
> 
...
>> diff --git a/drivers/net/wireless/ath/ath10k/core.c 
>> b/drivers/net/wireless/ath/ath10k/core.c
>> index 91f131b87efc..0e31846e6c89 100644
>> --- a/drivers/net/wireless/ath/ath10k/core.c
>> +++ b/drivers/net/wireless/ath/ath10k/core.c
>> @@ -2199,6 +2199,14 @@ static void ath10k_core_restart(struct 
>> work_struct *work)
>>  {
>>  	struct ath10k *ar = container_of(work, struct ath10k, restart_work);
>>  	int ret;
>> +	int restart_count;
>> +
>> +	restart_count = atomic_add_return(1, &ar->restart_count);
>> +	if (restart_count > 1) {
>> +		ath10k_warn(ar, "can not restart, count: %d\n", restart_count);
>> +		atomic_dec(&ar->restart_count);
>> +		return;
>> +	}
> 
> I have been thinking a different approach for this. I think another
> option is to have a function like this:
> 
> ath10k_core_firmware_crashed()
> {
>         queue_work(ar->workqueue, &ar->restart_work);
> }
> 
> In patch 1 we would convert all existing callers to call that
> function instead of queue_work() directly.
> 
> In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe
> should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet
> which one is better. Now the function would do:
> 
> ath10k_core_firmware_crashed()
> {
>         if (test_bit(flag))
>                 return
> 
>         set_bit(flag)
> 	queue_work(ar->workqueue, &ar->restart_work);
> }
> 
> That way restart_work queue would be called only one time.
> 
This is not muti-thread-safe, for example, if 2 thread entered to the 
test_bit(flag) meanwhile
and both check pass, then it will have 2 restart.

atomic_add_return is muti-thread-safe, if 2 thread entered it, only 1 
thread can pass
the check, another will fail and return.

The "payload length exceeds max htc length for sdio" happened many times 
in a very short time,
so I add this check for it.

> Though I'm not sure how ATH10K_STATE_WEDGED would behave after this
> change, it might get broken. Ah, actually I think even this patch 
> breaks
> the WEDGED state. This firmware restart is tricky, difficult to say 
> what
> is the best approach. Michal, are you reading? :) Any ideas?
> 
> And after looking more about this patch I don't see the need for the 
> new
> ar->restart_count atomic variable. Checking for ATH10K_FLAG_CRASH_FLUSH
> would do the same thing AFAICS.
> 
> And related to this, (in a separate patch) I think we should utilise
> ATH10K_FLAG_CRASH_FLUSH more. For example in ath10k_wmi_cmd_send() to
> not even try to send a WMI command if the flag is set. Basically all
> hardware access should be disabled except what is needed to restart the
> firmware.
Wen Gong Aug. 19, 2020, 12:01 p.m. UTC | #9
On 2020-08-15 01:19, Kalle Valo wrote:
...
> 
> I have been thinking a different approach for this. I think another
> option is to have a function like this:
> 
> ath10k_core_firmware_crashed()
> {
>         queue_work(ar->workqueue, &ar->restart_work);
> }
> 
> In patch 1 we would convert all existing callers to call that
> function instead of queue_work() directly.
> 
> In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe
> should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet
> which one is better. Now the function would do:
> 
> ath10k_core_firmware_crashed()
> {
>         if (test_bit(flag))
>                 return
> 
>         set_bit(flag)
> 	queue_work(ar->workqueue, &ar->restart_work);
> }
> 
It is better to clear_bit ATH10K_FLAG_CRASH_FLUSH/new flag in 
ath10k_reconfig_complete
instead of ath10k_core_start because ieee80211_reconfig(called by 
ieee80211_restart_work)
of mac80211 do many things and drv_start is 1st thing and 
drv_reconfig_complete is last thing.

> That way restart_work queue would be called only one time.
> 
> Though I'm not sure how ATH10K_STATE_WEDGED would behave after this
> change, it might get broken. Ah, actually I think even this patch 
> breaks
> the WEDGED state. This firmware restart is tricky, difficult to say 
> what
> is the best approach. Michal, are you reading? :) Any ideas?
> 
> And after looking more about this patch I don't see the need for the 
> new
> ar->restart_count atomic variable. Checking for ATH10K_FLAG_CRASH_FLUSH
> would do the same thing AFAICS.
> 
> And related to this, (in a separate patch) I think we should utilise
> ATH10K_FLAG_CRASH_FLUSH more. For example in ath10k_wmi_cmd_send() to
> not even try to send a WMI command if the flag is set. Basically all
> hardware access should be disabled except what is needed to restart the
> firmware.
Wen Gong Aug. 20, 2020, 9:18 a.m. UTC | #10
On 2020-08-15 01:19, Kalle Valo wrote:
...
> 
> I have been thinking a different approach for this. I think another
> option is to have a function like this:
> 
> ath10k_core_firmware_crashed()
> {
>         queue_work(ar->workqueue, &ar->restart_work);
> }
> 
> In patch 1 we would convert all existing callers to call that
> function instead of queue_work() directly.
> 
> In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe
> should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet
> which one is better. Now the function would do:
I thinks we can use test_and_set_bit for atomic operation athough it is 
same with restart_count.
and add a new flag, ATH10K_FLAG_CRASH_FLUSH is used for flush,
if still use ATH10K_FLAG_CRASH_FLUSH, it should change clear_bit of it 
from
ath10k_core_start to ath10k_reconfig_complete,because 
ieee80211_reconfig(called by
ieee80211_restart_work)
of mac80211 do many things and drv_start is 1st thing and
drv_reconfig_complete is last thing, drv_reconfig_complete done means 
the restart
finished.

I will send patch v5 with above changes if not other advise.
> 
> ath10k_core_firmware_crashed()
> {
>         if (test_bit(flag))
>                 return
> 
>         set_bit(flag)
> 	queue_work(ar->workqueue, &ar->restart_work);
> }
> 
> That way restart_work queue would be called only one time.
> 
> Though I'm not sure how ATH10K_STATE_WEDGED would behave after this
> change, it might get broken. Ah, actually I think even this patch 
> breaks
> the WEDGED state. This firmware restart is tricky, difficult to say 
> what
> is the best approach. Michal, are you reading? :) Any ideas?
> 
> And after looking more about this patch I don't see the need for the 
> new
> ar->restart_count atomic variable. Checking for ATH10K_FLAG_CRASH_FLUSH
> would do the same thing AFAICS.
> 
> And related to this, (in a separate patch) I think we should utilise
> ATH10K_FLAG_CRASH_FLUSH more. For example in ath10k_wmi_cmd_send() to
> not even try to send a WMI command if the flag is set. Basically all
> hardware access should be disabled except what is needed to restart the
> firmware.
Wen Gong Aug. 24, 2020, 4:36 a.m. UTC | #11
On 2020-08-20 17:18, Wen Gong wrote:
...
> I thinks we can use test_and_set_bit for atomic operation athough it
> is same with restart_count.
> and add a new flag, ATH10K_FLAG_CRASH_FLUSH is used for flush,
> if still use ATH10K_FLAG_CRASH_FLUSH, it should change clear_bit of it 
> from
> ath10k_core_start to ath10k_reconfig_complete,because
> ieee80211_reconfig(called by
> ieee80211_restart_work)
> of mac80211 do many things and drv_start is 1st thing and
> drv_reconfig_complete is last thing, drv_reconfig_complete done means
> the restart
> finished.
> 
> I will send patch v5 with above changes if not other advise.
v5 sent: https://patchwork.kernel.org/patch/11728101/
...
Kalle Valo Sept. 7, 2020, 3:52 p.m. UTC | #12
Wen Gong <wgong@codeaurora.org> writes:

> On 2020-08-15 01:19, Kalle Valo wrote:
>> Wen Gong <wgong@codeaurora.org> writes:
>>
> ...
>>> diff --git a/drivers/net/wireless/ath/ath10k/core.c
>>> b/drivers/net/wireless/ath/ath10k/core.c
>>> index 91f131b87efc..0e31846e6c89 100644
>>> --- a/drivers/net/wireless/ath/ath10k/core.c
>>> +++ b/drivers/net/wireless/ath/ath10k/core.c
>>> @@ -2199,6 +2199,14 @@ static void ath10k_core_restart(struct
>>> work_struct *work)
>>>  {
>>>  	struct ath10k *ar = container_of(work, struct ath10k, restart_work);
>>>  	int ret;
>>> +	int restart_count;
>>> +
>>> +	restart_count = atomic_add_return(1, &ar->restart_count);
>>> +	if (restart_count > 1) {
>>> +		ath10k_warn(ar, "can not restart, count: %d\n", restart_count);
>>> +		atomic_dec(&ar->restart_count);
>>> +		return;
>>> +	}
>>
>> I have been thinking a different approach for this. I think another
>> option is to have a function like this:
>>
>> ath10k_core_firmware_crashed()
>> {
>>         queue_work(ar->workqueue, &ar->restart_work);
>> }
>>
>> In patch 1 we would convert all existing callers to call that
>> function instead of queue_work() directly.
>>
>> In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe
>> should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet
>> which one is better. Now the function would do:
>>
>> ath10k_core_firmware_crashed()
>> {
>>         if (test_bit(flag))
>>                 return
>>
>>         set_bit(flag)
>> 	queue_work(ar->workqueue, &ar->restart_work);
>> }
>>
>> That way restart_work queue would be called only one time.
>
> This is not muti-thread-safe, for example, if 2 thread entered to the
> test_bit(flag) meanwhile and both check pass, then it will have 2
> restart.

Good point, this was racy. And I see that you found test_and_set_bit()
already to fix the race.

> atomic_add_return is muti-thread-safe, if 2 thread entered it, only 1
> thread can pass the check, another will fail and return.

I'm not going to add new state variables unless the justification is
REALLY strong and sound. This firmware restart is complicated as is
already, there's no reason to complicate it even more.
Kalle Valo Sept. 7, 2020, 3:55 p.m. UTC | #13
Wen Gong <wgong@codeaurora.org> writes:

> On 2020-08-15 01:19, Kalle Valo wrote:
> ...
>>
>> I have been thinking a different approach for this. I think another
>> option is to have a function like this:
>>
>> ath10k_core_firmware_crashed()
>> {
>>         queue_work(ar->workqueue, &ar->restart_work);
>> }
>>
>> In patch 1 we would convert all existing callers to call that
>> function instead of queue_work() directly.
>>
>> In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe
>> should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet
>> which one is better. Now the function would do:
>
> I thinks we can use test_and_set_bit for atomic operation athough it
> is same with restart_count.

I didn't quite understand what you mean here, but using
test_and_set_bit() is the correct methdo.

> and add a new flag, ATH10K_FLAG_CRASH_FLUSH is used for flush, if
> still use ATH10K_FLAG_CRASH_FLUSH, it should change clear_bit of it
> from ath10k_core_start to ath10k_reconfig_complete,because
> ieee80211_reconfig(called by ieee80211_restart_work) of mac80211 do
> many things and drv_start is 1st thing and drv_reconfig_complete is
> last thing, drv_reconfig_complete done means the restart finished.

Ok, let's discuss about that in v5. I hope you added the analysis to the
commit log.
Wen Gong Sept. 8, 2020, 3:47 a.m. UTC | #14
On 2020-09-07 23:55, Kalle Valo wrote:
> Wen Gong <wgong@codeaurora.org> writes:
> 
>> On 2020-08-15 01:19, Kalle Valo wrote:
>> ...
>>> 
>>> I have been thinking a different approach for this. I think another
>>> option is to have a function like this:
>>> 
>>> ath10k_core_firmware_crashed()
>>> {
>>>         queue_work(ar->workqueue, &ar->restart_work);
>>> }
>>> 
>>> In patch 1 we would convert all existing callers to call that
>>> function instead of queue_work() directly.
>>> 
>>> In patch 2 we would add a new flag to enum ath10k_dev_flags, or maybe
>>> should actually use existing ATH10K_FLAG_CRASH_FLUSH? Don't know yet
>>> which one is better. Now the function would do:
>> 
>> I thinks we can use test_and_set_bit for atomic operation athough it
>> is same with restart_count.
> 
> I didn't quite understand what you mean here, but using
> test_and_set_bit() is the correct methdo.
> 
>> and add a new flag, ATH10K_FLAG_CRASH_FLUSH is used for flush, if
>> still use ATH10K_FLAG_CRASH_FLUSH, it should change clear_bit of it
>> from ath10k_core_start to ath10k_reconfig_complete,because
>> ieee80211_reconfig(called by ieee80211_restart_work) of mac80211 do
>> many things and drv_start is 1st thing and drv_reconfig_complete is
>> last thing, drv_reconfig_complete done means the restart finished.
> 
> Ok, let's discuss about that in v5. I hope you added the analysis to 
> the
> commit log.
yes, v5 have alread sent in https://patchwork.kernel.org/patch/11728101/
[v5] ath10k: add atomic protection for device recovery
diff mbox series

Patch

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index 91f131b87efc..0e31846e6c89 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -2199,6 +2199,14 @@  static void ath10k_core_restart(struct work_struct *work)
 {
 	struct ath10k *ar = container_of(work, struct ath10k, restart_work);
 	int ret;
+	int restart_count;
+
+	restart_count = atomic_add_return(1, &ar->restart_count);
+	if (restart_count > 1) {
+		ath10k_warn(ar, "can not restart, count: %d\n", restart_count);
+		atomic_dec(&ar->restart_count);
+		return;
+	}
 
 	set_bit(ATH10K_FLAG_CRASH_FLUSH, &ar->dev_flags);
 
@@ -2231,6 +2239,11 @@  static void ath10k_core_restart(struct work_struct *work)
 
 	mutex_lock(&ar->conf_mutex);
 
+	if (ar->state != ATH10K_STATE_ON) {
+		ath10k_warn(ar, "state is not on: %d\n", ar->state);
+		atomic_dec(&ar->restart_count);
+	}
+
 	switch (ar->state) {
 	case ATH10K_STATE_ON:
 		ar->state = ATH10K_STATE_RESTARTING;
diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index e57b2e7235e3..810c99f2dc0e 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -982,6 +982,8 @@  struct ath10k {
 	/* protected by conf_mutex */
 	u8 ps_state_enable;
 
+	atomic_t restart_count;
+
 	bool nlo_enabled;
 	bool p2p;
 
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 3856edba7915..bc1574145e66 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -7208,6 +7208,7 @@  static void ath10k_reconfig_complete(struct ieee80211_hw *hw,
 		ath10k_info(ar, "device successfully recovered\n");
 		ar->state = ATH10K_STATE_ON;
 		ieee80211_wake_queues(ar->hw);
+		atomic_dec(&ar->restart_count);
 	}
 
 	mutex_unlock(&ar->conf_mutex);