diff mbox series

[v3] mac80211: fix regression in sta connection monitor

Message ID 20200927105605.97954-1-nbd@nbd.name (mailing list archive)
State Accepted
Delegated to: Johannes Berg
Headers show
Series [v3] mac80211: fix regression in sta connection monitor | expand

Commit Message

Felix Fietkau Sept. 27, 2020, 10:56 a.m. UTC
When a frame was acked and probe frames were sent, the connection monitoring
needs to be reset, otherwise it will keep probing until the connection is
considered dead, even though frames have been acked in the mean time.

Fixes: 9abf4e49830d ("mac80211: optimize station connection monitor")
Reported-by: Georgi Valkov <gvalkov@abv.bg>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
v3: only queue work on nullfunc frames status
v2: reset connection monitor when a frame was acked (not just for nulldata)

 net/mac80211/mlme.c   |  4 +++-
 net/mac80211/status.c | 16 ++++++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

Comments

Georgi Valkov Sept. 27, 2020, 3:10 p.m. UTC | #1
Hi Felix!

Here are some interesting tracing results based on v2. My tracing code is below.
The only printed output comes from this line, so it looks like the other places were I added tracing were never executed.
[11428.907823] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11459.636346] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11490.367121] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11523.141354] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11555.917399] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11586.644909] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11617.375977] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11650.147750] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11682.923378] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11713.649688] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11744.376130] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[11777.150797] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0

I just started testing v3, and so far it looks like this, but we need to wait and see if there will be any disconnect events:
I wonder why we experience an increase in probe_send_count every 130 seconds?
[  105.489078] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  136.204809] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  166.924520] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  199.690445] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  230.401303] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  261.110496] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  293.869304] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  324.580664] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  355.291914] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  386.002786] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  418.762544] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  449.473091] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  480.186644] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  512.941192] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[  545.700283] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  576.409210] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  607.119928] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  639.877866] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  670.588159] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[  701.298701] wifi: --- ack  probe_send_count: 1  vif.type 2:2


// patch.v2 + tracing
Index: backports-5.8-1/net/mac80211/status.c
===================================================================
--- backports-5.8-1.orig/net/mac80211/status.c
+++ backports-5.8-1/net/mac80211/status.c
@@ -1129,6 +1129,8 @@ void ieee80211_tx_status_ext(struct ieee
 	noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED);
 
 	if (pubsta) {
+		struct ieee80211_sub_if_data *sdata = sta->sdata;
+
 		if (!acked && !noack_success)
 			sta->status_stats.retry_failed++;
 		sta->status_stats.retry_count += retry_count;
@@ -1143,6 +1145,31 @@ void ieee80211_tx_status_ext(struct ieee
 				/* Track when last packet was ACKed */
 				sta->status_stats.last_pkt_time = jiffies;
 
+				/* Reset connection monitor */
+				if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+				    unlikely(sdata->u.mgd.probe_send_count > 0)) {
+					struct ieee80211_hdr * hdr = (struct ieee80211_hdr *)sdata;
+
+					sdata_info(
+						sdata,
+						"reset connection monitor  probe_send_count: %u  nullfunc: %u\n",
+						sdata->u.mgd.probe_send_count,
+						ieee80211_is_any_nullfunc(hdr->frame_control)
+					);
+
+					sdata->u.mgd.probe_send_count = 0;
+					ieee80211_queue_work(&local->hw, &sdata->work);
+				}
+
+				if (sdata->u.mgd.probe_send_count > 0)
+				{
+					sdata_info(
+						sdata,
+						"!!!  probe_send_count: %u\n",
+						sdata->u.mgd.probe_send_count
+					);
+				}
+
 				if (info->status.is_valid_ack_signal) {
 					sta->status_stats.last_ack_signal =
 							 (s8)info->status.ack_signal;
Index: backports-5.8-1/net/mac80211/mlme.c
===================================================================
--- backports-5.8-1.orig/net/mac80211/mlme.c
+++ backports-5.8-1/net/mac80211/mlme.c
@@ -2505,13 +2505,46 @@ void ieee80211_sta_tx_notify(struct ieee
 	ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time);
 
 	if (!ieee80211_is_any_nullfunc(hdr->frame_control) ||
-	    !sdata->u.mgd.probe_send_count)
+		!sdata->u.mgd.probe_send_count)
+	{
+		if (ack && sdata->u.mgd.probe_send_count)
+		{
+			sdata_info(
+				sdata,
+				"+++ ack  probe_send_count: %u  nullfunc_failed %u  vif.type %u:%u\n",
+				sdata->u.mgd.probe_send_count,
+				sdata->u.mgd.nullfunc_failed,
+				sdata->vif.type,
+				NL80211_IFTYPE_STATION
+			);
+		}
+
 		return;
+	}
 
 	if (ack)
+	{
+		sdata_info(
+			sdata,
+			"--- ack  probe_send_count: %u  vif.type %u:%u\n",
+			sdata->u.mgd.probe_send_count,
+			sdata->vif.type,
+			NL80211_IFTYPE_STATION
+		);
 		sdata->u.mgd.probe_send_count = 0;
+	}
 	else
+	{
+		sdata_info(
+			sdata,
+			"probe_send_count: %u  vif.type %u:%u\n",
+			sdata->u.mgd.probe_send_count,
+			sdata->vif.type,
+			NL80211_IFTYPE_STATION
+		);
 		sdata->u.mgd.nullfunc_failed = true;
+	}
+
 	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
 }
 


// 333-trace.patch for v3
Index: backports-5.8-1/net/mac80211/mlme.c
===================================================================
--- backports-5.8-1.orig/net/mac80211/mlme.c
+++ backports-5.8-1/net/mac80211/mlme.c
@@ -2505,13 +2505,46 @@ void ieee80211_sta_tx_notify(struct ieee
 	ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time);
 
 	if (!ieee80211_is_any_nullfunc(hdr->frame_control) ||
-	    !sdata->u.mgd.probe_send_count)
+		!sdata->u.mgd.probe_send_count)
+	{
+		if (ack && sdata->u.mgd.probe_send_count)
+		{
+			sdata_info(
+				sdata,
+				"+++ ack  probe_send_count: %u  nullfunc_failed %u  vif.type %u:%u\n",
+				sdata->u.mgd.probe_send_count,
+				sdata->u.mgd.nullfunc_failed,
+				sdata->vif.type,
+				NL80211_IFTYPE_STATION
+			);
+		}
+
 		return;
+	}
 
 	if (ack)
+	{
+		sdata_info(
+			sdata,
+			"--- ack  probe_send_count: %u  vif.type %u:%u\n",
+			sdata->u.mgd.probe_send_count,
+			sdata->vif.type,
+			NL80211_IFTYPE_STATION
+		);
 		sdata->u.mgd.probe_send_count = 0;
+	}
 	else
+	{
+		sdata_info(
+			sdata,
+			"probe_send_count: %u  vif.type %u:%u\n",
+			sdata->u.mgd.probe_send_count,
+			sdata->vif.type,
+			NL80211_IFTYPE_STATION
+		);
 		sdata->u.mgd.nullfunc_failed = true;
+	}
+
 	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
 }
 
Index: backports-5.8-1/net/mac80211/status.c
===================================================================
--- backports-5.8-1.orig/net/mac80211/status.c
+++ backports-5.8-1/net/mac80211/status.c
@@ -1148,8 +1148,28 @@ void ieee80211_tx_status_ext(struct ieee
 
 				/* Reset connection monitor */
 				if (sdata->vif.type == NL80211_IFTYPE_STATION &&
-				    unlikely(sdata->u.mgd.probe_send_count > 0))
+					unlikely(sdata->u.mgd.probe_send_count > 0))
+				{
+					struct ieee80211_hdr * hdr = (struct ieee80211_hdr *)sdata;
+
+					sdata_info(
+						sdata,
+						"reset connection monitor  probe_send_count: %u  nullfunc: %u\n",
+						sdata->u.mgd.probe_send_count,
+						ieee80211_is_any_nullfunc(hdr->frame_control)
+					);
+
 					sdata->u.mgd.probe_send_count = 0;
+				}
+
+				if (sdata->u.mgd.probe_send_count > 0)
+				{
+					sdata_info(
+						sdata,
+						"!!!  probe_send_count: %u\n",
+						sdata->u.mgd.probe_send_count
+					);
+				}
 
 				if (info->status.is_valid_ack_signal) {
 					sta->status_stats.last_ack_signal =



> On 2020-09-27, at 1:56 PM, Felix Fietkau <nbd@nbd.name> wrote:
> 
> When a frame was acked and probe frames were sent, the connection monitoring
> needs to be reset, otherwise it will keep probing until the connection is
> considered dead, even though frames have been acked in the mean time.
> 
> Fixes: 9abf4e49830d ("mac80211: optimize station connection monitor")
> Reported-by: Georgi Valkov <gvalkov@abv.bg>
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
> v3: only queue work on nullfunc frames status
> v2: reset connection monitor when a frame was acked (not just for nulldata)
> 
> net/mac80211/mlme.c   |  4 +++-
> net/mac80211/status.c | 16 ++++++++++++----
> 2 files changed, 15 insertions(+), 5 deletions(-)
> 
> diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
> index 50a9b9025725..7c04d8e30482 100644
> --- a/net/mac80211/mlme.c
> +++ b/net/mac80211/mlme.c
> @@ -2508,7 +2508,9 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
> 	    !sdata->u.mgd.probe_send_count)
> 		return;
> 
> -	if (!ack)
> +	if (ack)
> +		sdata->u.mgd.probe_send_count = 0;
> +	else
> 		sdata->u.mgd.nullfunc_failed = true;
> 	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
> }
> diff --git a/net/mac80211/status.c b/net/mac80211/status.c
> index 7fe5bececfd9..88a736f3c413 100644
> --- a/net/mac80211/status.c
> +++ b/net/mac80211/status.c
> @@ -982,10 +982,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
> 		if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked)
> 			ieee80211_frame_acked(sta, skb);
> 
> -		if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) &&
> -		    ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
> -			ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data,
> -						acked, info->status.tx_time);
> 	}
> 
> 	/* SNMP counters
> @@ -1120,11 +1116,18 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
> 	noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED);
> 
> 	if (pubsta) {
> +		struct ieee80211_sub_if_data *sdata = sta->sdata;
> +
> 		if (!acked && !noack_success)
> 			sta->status_stats.retry_failed++;
> 		sta->status_stats.retry_count += retry_count;
> 
> 		if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
> +			if (sdata->vif.type == NL80211_IFTYPE_STATION &&
> +			    skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP))
> +				ieee80211_sta_tx_notify(sdata, (void *) skb->data,
> +							acked, info->status.tx_time);
> +
> 			if (acked) {
> 				sta->status_stats.last_ack = jiffies;
> 
> @@ -1134,6 +1137,11 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
> 				/* Track when last packet was ACKed */
> 				sta->status_stats.last_pkt_time = jiffies;
> 
> +				/* Reset connection monitor */
> +				if (sdata->vif.type == NL80211_IFTYPE_STATION &&
> +				    unlikely(sdata->u.mgd.probe_send_count > 0))
> +					sdata->u.mgd.probe_send_count = 0;
> +
> 				if (info->status.is_valid_ack_signal) {
> 					sta->status_stats.last_ack_signal =
> 							 (s8)info->status.ack_signal;
> -- 
> 2.28.0
> 
>
Georgi Valkov Sept. 28, 2020, 5:09 p.m. UTC | #2
Hi Felix!

Unfortunately it disconnected again after 21 hours and 44 minutes of uptime, and remained offline for 5 minutes again.
I will add some code to ieee80211_sta_work() to trace more info when a disconnect occurs.
Do you have any ideas? Why does it remain offline for exactly 5 minutes every time? Do we need to reset probe_send_count in rx.c? Do we have to send probe requests or nullfunc while there is active communication, e.g. acknowledged ICMP traffic?

This is the most common event in the trace, it occurs every 30 seconds, we called ieee80211_sta_tx_notify(ack: true), and ieee80211_is_any_nullfunc() returned true, so probe_send_count is reset:
--- ack  probe_send_count: 1  vif.type 2:2

These events are rare, I could not find any pattern. probe_send_count is reset inside status.c
reset connection monitor  probe_send_count: 1  nullfunc: 0

These events are rare and appear randomly. The first indicates that we called ieee80211_sta_tx_notify(ack: false), and ieee80211_is_any_nullfunc() returned true, so probe_send_count is not reset, while the second always appears 10ms later with ack = true, and ieee80211_is_any_nullfunc() returned true, so probe_send_count is reset.
probe_send_count: 1  vif.type 2:2
--- ack  probe_send_count: 2  vif.type 2:2

Here are some interesting parts from the trace:

[STA kernel log]
[55045.254528] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[55075.971445] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[55106.699439] wifi: --- ack  probe_send_count: 1  vif.type 2:2
...
[57812.177775] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[57842.894483] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
[57873.610852] wifi: --- ack  probe_send_count: 1  vif.type 2:2
...
[78172.681502] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78203.398703] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78234.118347] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78264.834126] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78282.530133] wifi: deauthenticated from 60:38:e0:xx:xx:xx (Reason: 16=GROUP_KEY_HANDSHAKE_TIMEOUT)
[78282.539463] br-lan: port 3(wifi) entered disabled state
[78282.544987] br-lan: topology change detected, propagating
[78283.514078] wifi: authenticate with 60:38:e0:xx:xx:xx
[78283.533405] wifi: send auth to 60:38:e0:xx:xx:xx (try 1/3)
[78283.577391] wifi: authenticated
[78283.583874] wifi: associate with 60:38:e0:xx:xx:xx (try 1/3)
[78283.610166] wifi: RX AssocResp from 60:38:e0:xx:xx:xx (capab=0x431 status=0 aid=1)
[78283.618322] wifi: associated
[78283.628908] br-lan: port 3(wifi) entered blocking state
[78283.634358] br-lan: port 3(wifi) entered listening state
[78285.663507] br-lan: port 3(wifi) entered learning state
[78287.679338] br-lan: port 3(wifi) entered forwarding state
[78287.684939] br-lan: topology change detected, sending tcn bpdu
[78313.981348] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78316.028959] br-lan: port 3(wifi) neighbor 7fff.60:38:e0:xx:xx:xy lost
[78316.035630] br-lan: topology change detected, propagating
[78346.746657] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78377.464074] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78408.181527] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78438.898977] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78471.664287] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78502.381794] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78533.099298] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78563.816824] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78583.518337] wifi: disassociated from 60:38:e0:xx:xx:xx (Reason: 4=DISASSOC_DUE_TO_INACTIVITY)
[78583.527312] br-lan: port 3(wifi) entered disabled state
[78583.584070] wifi: authenticate with 60:38:e0:xx:xx:xx
[78583.603408] wifi: send auth to 60:38:e0:xx:xx:xx (try 1/3)
[78583.610306] wifi: authenticated
[78583.615079] wifi: associate with 60:38:e0:xx:xx:xx (try 1/3)
[78583.642902] wifi: RX AssocResp from 60:38:e0:xx:xx:xx (capab=0x431 status=0 aid=1)
[78583.651100] wifi: associated
[78583.662922] br-lan: port 3(wifi) entered blocking state
[78583.668372] br-lan: port 3(wifi) entered listening state
[78585.702846] br-lan: port 3(wifi) entered learning state
[78587.718684] br-lan: port 3(wifi) entered forwarding state
[78587.724275] br-lan: topology change detected, sending tcn bpdu
[78615.014679] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78647.778120] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78680.543528] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78711.261101] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78741.978703] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78774.744145] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78807.510701] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78838.229555] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[78868.945609] wifi: --- ack  probe_send_count: 1  vif.type 2:2
...
[85414.017928] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85444.736816] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85475.456436] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85508.224997] wifi: probe_send_count: 1  vif.type 2:2
[85508.230895] wifi: --- ack  probe_send_count: 2  vif.type 2:2
[85538.941413] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85569.662128] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85600.385660] wifi: probe_send_count: 1  vif.type 2:2
[85600.391453] wifi: --- ack  probe_send_count: 2  vif.type 2:2
[85633.146166] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85663.864942] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85694.583857] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85727.350693] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85758.069606] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85788.795989] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85819.507449] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85852.274294] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85882.993213] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85913.713628] wifi: probe_send_count: 1  vif.type 2:2
[85913.719119] wifi: --- ack  probe_send_count: 2  vif.type 2:2
[85944.431087] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[85977.197935] wifi: --- ack  probe_send_count: 1  vif.type 2:2

[STA system log]
Mon Sep 28 15:24:37 2020 kern.info kernel: [78109.201208] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:25:08 2020 kern.info kernel: [78139.916198] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:25:41 2020 kern.info kernel: [78172.681502] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:26:12 2020 kern.info kernel: [78203.398703] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:26:42 2020 kern.info kernel: [78234.118347] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:27:13 2020 kern.info kernel: [78264.834126] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:27:27 2020 daemon.notice wpa_supplicant[1436]: wifi: WPA: Group rekeying completed with 60:38:e0:xx:xx:xx [GTK=CCMP]
Mon Sep 28 15:27:31 2020 kern.info kernel: [78282.530133] wifi: deauthenticated from 60:38:e0:xx:xx:xx (Reason: 16=GROUP_KEY_HANDSHAKE_TIMEOUT)
Mon Sep 28 15:27:31 2020 kern.info kernel: [78282.539463] br-lan: port 3(wifi) entered disabled state
Mon Sep 28 15:27:31 2020 kern.info kernel: [78282.544987] br-lan: topology change detected, propagating
Mon Sep 28 15:27:31 2020 daemon.notice netifd: Network device 'wifi' link is down
Mon Sep 28 15:27:31 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-DISCONNECTED bssid=60:38:e0:xx:xx:xx reason=16
Mon Sep 28 15:27:32 2020 daemon.notice wpa_supplicant[1436]: wifi: SME: Trying to authenticate with 60:38:e0:xx:xx:xx (SSID='wifi' freq=2432 MHz)
Mon Sep 28 15:27:32 2020 kern.info kernel: [78283.514078] wifi: authenticate with 60:38:e0:xx:xx:xx
Mon Sep 28 15:27:32 2020 kern.info kernel: [78283.533405] wifi: send auth to 60:38:e0:xx:xx:xx (try 1/3)
Mon Sep 28 15:27:32 2020 daemon.notice wpa_supplicant[1436]: wifi: Trying to associate with 60:38:e0:xx:xx:xx (SSID='wifi' freq=2432 MHz)
Mon Sep 28 15:27:32 2020 kern.info kernel: [78283.577391] wifi: authenticated
Mon Sep 28 15:27:32 2020 kern.info kernel: [78283.583874] wifi: associate with 60:38:e0:xx:xx:xx (try 1/3)
Mon Sep 28 15:27:32 2020 kern.info kernel: [78283.610166] wifi: RX AssocResp from 60:38:e0:xx:xx:xx (capab=0x431 status=0 aid=1)
Mon Sep 28 15:27:32 2020 kern.info kernel: [78283.618322] wifi: associated
Mon Sep 28 15:27:32 2020 daemon.notice wpa_supplicant[1436]: wifi: Associated with 60:38:e0:xx:xx:xx
Mon Sep 28 15:27:32 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-SUBNET-STATUS-UPDATE status=0
Mon Sep 28 15:27:32 2020 daemon.notice netifd: Network device 'wifi' link is up
Mon Sep 28 15:27:32 2020 daemon.notice wpa_supplicant[1436]: wifi: WPA: Key negotiation completed with 60:38:e0:xx:xx:xx [PTK=CCMP GTK=CCMP]
Mon Sep 28 15:27:32 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-CONNECTED - Connection to 60:38:e0:xx:xx:xx completed [id=0 id_str=]
Mon Sep 28 15:27:32 2020 kern.info kernel: [78283.628908] br-lan: port 3(wifi) entered blocking state
Mon Sep 28 15:27:32 2020 kern.info kernel: [78283.634358] br-lan: port 3(wifi) entered listening state
Mon Sep 28 15:27:34 2020 kern.info kernel: [78285.663507] br-lan: port 3(wifi) entered learning state
Mon Sep 28 15:27:36 2020 kern.info kernel: [78287.679338] br-lan: port 3(wifi) entered forwarding state
Mon Sep 28 15:27:36 2020 kern.info kernel: [78287.684939] br-lan: topology change detected, sending tcn bpdu
Mon Sep 28 15:28:02 2020 kern.info kernel: [78313.981348] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:28:04 2020 kern.info kernel: [78316.028959] br-lan: port 3(wifi) neighbor 7fff.60:38:e0:xx:xx:xy lost
Mon Sep 28 15:28:04 2020 kern.info kernel: [78316.035630] br-lan: topology change detected, propagating
Mon Sep 28 15:28:35 2020 kern.info kernel: [78346.746657] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:29:06 2020 kern.info kernel: [78377.464074] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:29:36 2020 kern.info kernel: [78408.181527] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:30:07 2020 kern.info kernel: [78438.898977] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:30:40 2020 kern.info kernel: [78471.664287] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:31:11 2020 kern.info kernel: [78502.381794] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:31:41 2020 kern.info kernel: [78533.099298] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:32:12 2020 kern.info kernel: [78563.816824] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.518337] wifi: disassociated from 60:38:e0:xx:xx:xx (Reason: 4=DISASSOC_DUE_TO_INACTIVITY)
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.527312] br-lan: port 3(wifi) entered disabled state
Mon Sep 28 15:32:32 2020 daemon.notice netifd: Network device 'wifi' link is down
Mon Sep 28 15:32:32 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-DISCONNECTED bssid=60:38:e0:xx:xx:xx reason=4
Mon Sep 28 15:32:32 2020 daemon.notice wpa_supplicant[1436]: wifi: SME: Trying to authenticate with 60:38:e0:xx:xx:xx (SSID='wifi' freq=2432 MHz)
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.584070] wifi: authenticate with 60:38:e0:xx:xx:xx
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.603408] wifi: send auth to 60:38:e0:xx:xx:xx (try 1/3)
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.610306] wifi: authenticated
Mon Sep 28 15:32:32 2020 daemon.notice wpa_supplicant[1436]: wifi: Trying to associate with 60:38:e0:xx:xx:xx (SSID='wifi' freq=2432 MHz)
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.615079] wifi: associate with 60:38:e0:xx:xx:xx (try 1/3)
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.642902] wifi: RX AssocResp from 60:38:e0:xx:xx:xx (capab=0x431 status=0 aid=1)
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.651100] wifi: associated
Mon Sep 28 15:32:32 2020 daemon.notice netifd: Network device 'wifi' link is up
Mon Sep 28 15:32:32 2020 daemon.notice wpa_supplicant[1436]: wifi: Associated with 60:38:e0:xx:xx:xx
Mon Sep 28 15:32:32 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-SUBNET-STATUS-UPDATE status=0
Mon Sep 28 15:32:32 2020 daemon.notice wpa_supplicant[1436]: wifi: WPA: Key negotiation completed with 60:38:e0:xx:xx:xx [PTK=CCMP GTK=CCMP]
Mon Sep 28 15:32:32 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-CONNECTED - Connection to 60:38:e0:xx:xx:xx completed [id=0 id_str=]
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.662922] br-lan: port 3(wifi) entered blocking state
Mon Sep 28 15:32:32 2020 kern.info kernel: [78583.668372] br-lan: port 3(wifi) entered listening state
Mon Sep 28 15:32:34 2020 kern.info kernel: [78585.702846] br-lan: port 3(wifi) entered learning state
Mon Sep 28 15:32:36 2020 kern.info kernel: [78587.718684] br-lan: port 3(wifi) entered forwarding state
Mon Sep 28 15:32:36 2020 kern.info kernel: [78587.724275] br-lan: topology change detected, sending tcn bpdu
Mon Sep 28 15:33:03 2020 kern.info kernel: [78615.014679] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:33:36 2020 kern.info kernel: [78647.778120] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:34:09 2020 kern.info kernel: [78680.543528] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:34:40 2020 kern.info kernel: [78711.261101] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:35:10 2020 kern.info kernel: [78741.978703] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:35:43 2020 kern.info kernel: [78774.744145] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:36:16 2020 kern.info kernel: [78807.510701] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:36:47 2020 kern.info kernel: [78838.229555] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:37:17 2020 kern.info kernel: [78868.945609] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:37:50 2020 kern.info kernel: [78901.710407] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:38:23 2020 kern.info kernel: [78934.475985] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:38:54 2020 kern.info kernel: [78965.193562] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:39:24 2020 kern.info kernel: [78995.923202] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:39:57 2020 kern.info kernel: [79028.676806] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:40:28 2020 kern.info kernel: [79059.394517] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:40:58 2020 kern.info kernel: [79090.112246] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:41:29 2020 kern.info kernel: [79120.830012] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:42:02 2020 kern.info kernel: [79153.596715] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:42:33 2020 kern.info kernel: [79184.313338] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:43:03 2020 kern.info kernel: [79215.031325] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:43:34 2020 kern.info kernel: [79245.748885] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:44:00 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-BEACON-LOSS
Mon Sep 28 15:44:00 2020 kern.info kernel: [79271.699067] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:44:02 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-BEACON-LOSS
Mon Sep 28 15:44:02 2020 kern.info kernel: [79273.714936] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:44:03 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-BEACON-LOSS
Mon Sep 28 15:44:03 2020 kern.info kernel: [79274.707777] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:44:09 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-BEACON-LOSS
Mon Sep 28 15:44:09 2020 kern.info kernel: [79280.692306] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Mon Sep 28 15:44:10 2020 daemon.notice wpa_supplicant[1436]: wifi: CTRL-EVENT-BEACON-LOSS
Mon Sep 28 15:44:10 2020 kern.info kernel: [79281.715611] wifi: --- ack  probe_send_count: 1  vif.type 2:2


Good luck!

Georgi



> On 2020-09-27, at 6:10 PM, Georgi Valkov <gvalkov@abv.bg> wrote:
> 
> Hi Felix!
> 
> Here are some interesting tracing results based on v2. My tracing code is below.
> The only printed output comes from this line, so it looks like the other places were I added tracing were never executed.
> [11428.907823] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11459.636346] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11490.367121] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11523.141354] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11555.917399] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11586.644909] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11617.375977] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11650.147750] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11682.923378] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11713.649688] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11744.376130] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [11777.150797] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> 
> I just started testing v3, and so far it looks like this, but we need to wait and see if there will be any disconnect events:
> I wonder why we experience an increase in probe_send_count every 130 seconds?
> [  105.489078] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  136.204809] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  166.924520] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  199.690445] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  230.401303] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  261.110496] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  293.869304] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  324.580664] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  355.291914] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  386.002786] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  418.762544] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  449.473091] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  480.186644] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  512.941192] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [  545.700283] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  576.409210] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  607.119928] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  639.877866] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  670.588159] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [  701.298701] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> 
> 
> // patch.v2 + tracing
> Index: backports-5.8-1/net/mac80211/status.c
> ===================================================================
> --- backports-5.8-1.orig/net/mac80211/status.c
> +++ backports-5.8-1/net/mac80211/status.c
> @@ -1129,6 +1129,8 @@ void ieee80211_tx_status_ext(struct ieee
> 	noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED);
> 
> 	if (pubsta) {
> +		struct ieee80211_sub_if_data *sdata = sta->sdata;
> +
> 		if (!acked && !noack_success)
> 			sta->status_stats.retry_failed++;
> 		sta->status_stats.retry_count += retry_count;
> @@ -1143,6 +1145,31 @@ void ieee80211_tx_status_ext(struct ieee
> 				/* Track when last packet was ACKed */
> 				sta->status_stats.last_pkt_time = jiffies;
> 
> +				/* Reset connection monitor */
> +				if (sdata->vif.type == NL80211_IFTYPE_STATION &&
> +				    unlikely(sdata->u.mgd.probe_send_count > 0)) {
> +					struct ieee80211_hdr * hdr = (struct ieee80211_hdr *)sdata;
> +
> +					sdata_info(
> +						sdata,
> +						"reset connection monitor  probe_send_count: %u  nullfunc: %u\n",
> +						sdata->u.mgd.probe_send_count,
> +						ieee80211_is_any_nullfunc(hdr->frame_control)
> +					);
> +
> +					sdata->u.mgd.probe_send_count = 0;
> +					ieee80211_queue_work(&local->hw, &sdata->work);
> +				}
> +
> +				if (sdata->u.mgd.probe_send_count > 0)
> +				{
> +					sdata_info(
> +						sdata,
> +						"!!!  probe_send_count: %u\n",
> +						sdata->u.mgd.probe_send_count
> +					);
> +				}
> +
> 				if (info->status.is_valid_ack_signal) {
> 					sta->status_stats.last_ack_signal =
> 							 (s8)info->status.ack_signal;
> Index: backports-5.8-1/net/mac80211/mlme.c
> ===================================================================
> --- backports-5.8-1.orig/net/mac80211/mlme.c
> +++ backports-5.8-1/net/mac80211/mlme.c
> @@ -2505,13 +2505,46 @@ void ieee80211_sta_tx_notify(struct ieee
> 	ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time);
> 
> 	if (!ieee80211_is_any_nullfunc(hdr->frame_control) ||
> -	    !sdata->u.mgd.probe_send_count)
> +		!sdata->u.mgd.probe_send_count)
> +	{
> +		if (ack && sdata->u.mgd.probe_send_count)
> +		{
> +			sdata_info(
> +				sdata,
> +				"+++ ack  probe_send_count: %u  nullfunc_failed %u  vif.type %u:%u\n",
> +				sdata->u.mgd.probe_send_count,
> +				sdata->u.mgd.nullfunc_failed,
> +				sdata->vif.type,
> +				NL80211_IFTYPE_STATION
> +			);
> +		}
> +
> 		return;
> +	}
> 
> 	if (ack)
> +	{
> +		sdata_info(
> +			sdata,
> +			"--- ack  probe_send_count: %u  vif.type %u:%u\n",
> +			sdata->u.mgd.probe_send_count,
> +			sdata->vif.type,
> +			NL80211_IFTYPE_STATION
> +		);
> 		sdata->u.mgd.probe_send_count = 0;
> +	}
> 	else
> +	{
> +		sdata_info(
> +			sdata,
> +			"probe_send_count: %u  vif.type %u:%u\n",
> +			sdata->u.mgd.probe_send_count,
> +			sdata->vif.type,
> +			NL80211_IFTYPE_STATION
> +		);
> 		sdata->u.mgd.nullfunc_failed = true;
> +	}
> +
> 	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
> }
> 
> 
> 
> // 333-trace.patch for v3
> Index: backports-5.8-1/net/mac80211/mlme.c
> ===================================================================
> --- backports-5.8-1.orig/net/mac80211/mlme.c
> +++ backports-5.8-1/net/mac80211/mlme.c
> @@ -2505,13 +2505,46 @@ void ieee80211_sta_tx_notify(struct ieee
> 	ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time);
> 
> 	if (!ieee80211_is_any_nullfunc(hdr->frame_control) ||
> -	    !sdata->u.mgd.probe_send_count)
> +		!sdata->u.mgd.probe_send_count)
> +	{
> +		if (ack && sdata->u.mgd.probe_send_count)
> +		{
> +			sdata_info(
> +				sdata,
> +				"+++ ack  probe_send_count: %u  nullfunc_failed %u  vif.type %u:%u\n",
> +				sdata->u.mgd.probe_send_count,
> +				sdata->u.mgd.nullfunc_failed,
> +				sdata->vif.type,
> +				NL80211_IFTYPE_STATION
> +			);
> +		}
> +
> 		return;
> +	}
> 
> 	if (ack)
> +	{
> +		sdata_info(
> +			sdata,
> +			"--- ack  probe_send_count: %u  vif.type %u:%u\n",
> +			sdata->u.mgd.probe_send_count,
> +			sdata->vif.type,
> +			NL80211_IFTYPE_STATION
> +		);
> 		sdata->u.mgd.probe_send_count = 0;
> +	}
> 	else
> +	{
> +		sdata_info(
> +			sdata,
> +			"probe_send_count: %u  vif.type %u:%u\n",
> +			sdata->u.mgd.probe_send_count,
> +			sdata->vif.type,
> +			NL80211_IFTYPE_STATION
> +		);
> 		sdata->u.mgd.nullfunc_failed = true;
> +	}
> +
> 	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
> }
> 
> Index: backports-5.8-1/net/mac80211/status.c
> ===================================================================
> --- backports-5.8-1.orig/net/mac80211/status.c
> +++ backports-5.8-1/net/mac80211/status.c
> @@ -1148,8 +1148,28 @@ void ieee80211_tx_status_ext(struct ieee
> 
> 				/* Reset connection monitor */
> 				if (sdata->vif.type == NL80211_IFTYPE_STATION &&
> -				    unlikely(sdata->u.mgd.probe_send_count > 0))
> +					unlikely(sdata->u.mgd.probe_send_count > 0))
> +				{
> +					struct ieee80211_hdr * hdr = (struct ieee80211_hdr *)sdata;
> +
> +					sdata_info(
> +						sdata,
> +						"reset connection monitor  probe_send_count: %u  nullfunc: %u\n",
> +						sdata->u.mgd.probe_send_count,
> +						ieee80211_is_any_nullfunc(hdr->frame_control)
> +					);
> +
> 					sdata->u.mgd.probe_send_count = 0;
> +				}
> +
> +				if (sdata->u.mgd.probe_send_count > 0)
> +				{
> +					sdata_info(
> +						sdata,
> +						"!!!  probe_send_count: %u\n",
> +						sdata->u.mgd.probe_send_count
> +					);
> +				}
> 
> 				if (info->status.is_valid_ack_signal) {
> 					sta->status_stats.last_ack_signal =
> 
> 
> 
>> On 2020-09-27, at 1:56 PM, Felix Fietkau <nbd@nbd.name> wrote:
>> 
>> When a frame was acked and probe frames were sent, the connection monitoring
>> needs to be reset, otherwise it will keep probing until the connection is
>> considered dead, even though frames have been acked in the mean time.
>> 
>> Fixes: 9abf4e49830d ("mac80211: optimize station connection monitor")
>> Reported-by: Georgi Valkov <gvalkov@abv.bg>
>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>> ---
>> v3: only queue work on nullfunc frames status
>> v2: reset connection monitor when a frame was acked (not just for nulldata)
>> 
>> net/mac80211/mlme.c   |  4 +++-
>> net/mac80211/status.c | 16 ++++++++++++----
>> 2 files changed, 15 insertions(+), 5 deletions(-)
>> 
>> diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
>> index 50a9b9025725..7c04d8e30482 100644
>> --- a/net/mac80211/mlme.c
>> +++ b/net/mac80211/mlme.c
>> @@ -2508,7 +2508,9 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
>> 	    !sdata->u.mgd.probe_send_count)
>> 		return;
>> 
>> -	if (!ack)
>> +	if (ack)
>> +		sdata->u.mgd.probe_send_count = 0;
>> +	else
>> 		sdata->u.mgd.nullfunc_failed = true;
>> 	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
>> }
>> diff --git a/net/mac80211/status.c b/net/mac80211/status.c
>> index 7fe5bececfd9..88a736f3c413 100644
>> --- a/net/mac80211/status.c
>> +++ b/net/mac80211/status.c
>> @@ -982,10 +982,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
>> 		if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked)
>> 			ieee80211_frame_acked(sta, skb);
>> 
>> -		if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) &&
>> -		    ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
>> -			ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data,
>> -						acked, info->status.tx_time);
>> 	}
>> 
>> 	/* SNMP counters
>> @@ -1120,11 +1116,18 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
>> 	noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED);
>> 
>> 	if (pubsta) {
>> +		struct ieee80211_sub_if_data *sdata = sta->sdata;
>> +
>> 		if (!acked && !noack_success)
>> 			sta->status_stats.retry_failed++;
>> 		sta->status_stats.retry_count += retry_count;
>> 
>> 		if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
>> +			if (sdata->vif.type == NL80211_IFTYPE_STATION &&
>> +			    skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP))
>> +				ieee80211_sta_tx_notify(sdata, (void *) skb->data,
>> +							acked, info->status.tx_time);
>> +
>> 			if (acked) {
>> 				sta->status_stats.last_ack = jiffies;
>> 
>> @@ -1134,6 +1137,11 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
>> 				/* Track when last packet was ACKed */
>> 				sta->status_stats.last_pkt_time = jiffies;
>> 
>> +				/* Reset connection monitor */
>> +				if (sdata->vif.type == NL80211_IFTYPE_STATION &&
>> +				    unlikely(sdata->u.mgd.probe_send_count > 0))
>> +					sdata->u.mgd.probe_send_count = 0;
>> +
>> 				if (info->status.is_valid_ack_signal) {
>> 					sta->status_stats.last_ack_signal =
>> 							 (s8)info->status.ack_signal;
>> -- 
>> 2.28.0
>> 
>> 
> 
>
Felix Fietkau Sept. 29, 2020, 12:13 p.m. UTC | #3
On 2020-09-28 19:09, Georgi Valkov wrote:
> Hi Felix!
> 
> Unfortunately it disconnected again after 21 hours and 44 minutes of uptime, and remained offline for 5 minutes again.
> I will add some code to ieee80211_sta_work() to trace more info when a disconnect occurs.
> Do you have any ideas? Why does it remain offline for exactly 5 minutes every time? Do we need to reset probe_send_count in rx.c? Do we have to send probe requests or nullfunc while there is active communication, e.g. acknowledged ICMP traffic?
> 
> This is the most common event in the trace, it occurs every 30 seconds, we called ieee80211_sta_tx_notify(ack: true), and ieee80211_is_any_nullfunc() returned true, so probe_send_count is reset:
> --- ack  probe_send_count: 1  vif.type 2:2
> 
> These events are rare, I could not find any pattern. probe_send_count is reset inside status.c
> reset connection monitor  probe_send_count: 1  nullfunc: 0
> 
> These events are rare and appear randomly. The first indicates that we called ieee80211_sta_tx_notify(ack: false), and ieee80211_is_any_nullfunc() returned true, so probe_send_count is not reset, while the second always appears 10ms later with ack = true, and ieee80211_is_any_nullfunc() returned true, so probe_send_count is reset.
> probe_send_count: 1  vif.type 2:2
> --- ack  probe_send_count: 2  vif.type 2:2
So far I don't see any strong indication that these disconnects are
related to this patch. The debug messages you added seem normal to me.
More comments on the disconnects below.

> Here are some interesting parts from the trace:
> 
> [STA kernel log]
> [55045.254528] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [55075.971445] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [55106.699439] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> ...
> [57812.177775] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [57842.894483] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
> [57873.610852] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> ...
> [78172.681502] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78203.398703] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78234.118347] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78264.834126] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78282.530133] wifi: deauthenticated from 60:38:e0:xx:xx:xx (Reason: 16=GROUP_KEY_HANDSHAKE_TIMEOUT)
This one is definitely not related to the connection monitor.

> [78282.539463] br-lan: port 3(wifi) entered disabled state
> [78282.544987] br-lan: topology change detected, propagating
> [78283.514078] wifi: authenticate with 60:38:e0:xx:xx:xx
> [78283.533405] wifi: send auth to 60:38:e0:xx:xx:xx (try 1/3)
> [78283.577391] wifi: authenticated
> [78283.583874] wifi: associate with 60:38:e0:xx:xx:xx (try 1/3)
> [78283.610166] wifi: RX AssocResp from 60:38:e0:xx:xx:xx (capab=0x431 status=0 aid=1)
> [78283.618322] wifi: associated
> [78283.628908] br-lan: port 3(wifi) entered blocking state
> [78283.634358] br-lan: port 3(wifi) entered listening state
> [78285.663507] br-lan: port 3(wifi) entered learning state
> [78287.679338] br-lan: port 3(wifi) entered forwarding state
> [78287.684939] br-lan: topology change detected, sending tcn bpdu
> [78313.981348] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78316.028959] br-lan: port 3(wifi) neighbor 7fff.60:38:e0:xx:xx:xy lost
> [78316.035630] br-lan: topology change detected, propagating
> [78346.746657] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78377.464074] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78408.181527] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78438.898977] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78471.664287] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78502.381794] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78533.099298] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78563.816824] wifi: --- ack  probe_send_count: 1  vif.type 2:2
> [78583.518337] wifi: disassociated from 60:38:e0:xx:xx:xx (Reason: 4=DISASSOC_DUE_TO_INACTIVITY)
Are you sure that this one is even related to the connection monitor? It
seems to me that the client may have been kicked from the AP due to
inactivity (might be AP probing, not STA probing).

- Felix
Georgi Valkov Sept. 29, 2020, 2:39 p.m. UTC | #4
Hi Felix!
I was also beginning to think that the disconnect where "group key handshake failed" is unrelated, because v3 of your patch resets probe_send_count consistently. In the rare event when the nullfunc is not acknowledged the first time due to a packet loss, it is resent and acknowleged a few miliseconds later. During a disconnect, the locations in ieee80211_sta_work() which print "No ack for nullfunc [... disconnecting]" are not called. I added printk to be sure. Comparing the logs during that disconnect, according to the AP: group key handshake failed (RSN) after 4 tries, while the STA thinks that: Group rekeying completed, and then gets kicked off the network. Then it tries to authenticate immediately, and both sides think authentication is successful, with the tiny detail that the AP prints "authenticated" 19 seconds later, and then immediately deletes the interface: phy1: staid 1 deleted. Yet for some reason the following 5 minutes the AP replays to nullfunc from STA and STA think everything is good. Something is wrong here.
1. Isn't the nullfunc supposed to fail, so the STA gets kicked off faster and reconnect?
2. Why does the group key handshake fail in first place? Could this be related to the KRACK countermeasurement which is supported and enabled in the AP (WRT3200ACM has a newer radio), but is probably not supported (and not listed in Luci) on the old router (1043ND v4). Or is this some bug? Both are compiled from the same commit (a few days ago on master).

At this pont I suggest you commit v3 and mark the original issue as resolved. I will be very thankful if you help me resolve the group handshake or why the STA remains offline for 5 minutes.

Thank you for being with me, and please let me know what you think about the other e-mails I sent:
1. json-c: Embed RPATH on macOS to prevent error in ucert: dyld: Library not loaded: libjson-c.5.dylib
2. Building on macOS with Xcode 12, clang 12 - sent only to the mail listing, let me know if I should forward a copy to you.

Good luck!

Georgi


[AP system log]
Tue Sep 29 15:27:04 2020 daemon.info hostapd: wifi_5g: STA 38:f9:d3:aa:aa:aa WPA: group key handshake completed (RSN)
Tue Sep 29 15:27:04 2020 daemon.info hostapd: wifi_5g: STA d0:c5:f3:bb:bb:bb WPA: group key handshake completed (RSN)
Tue Sep 29 15:27:07 2020 daemon.info hostapd: wifi: STA 98:de:d0:xx:xx:xx WPA: group key handshake failed (RSN) after 4 tries
Tue Sep 29 15:27:07 2020 daemon.notice hostapd: wifi: AP-STA-DISCONNECTED 98:de:d0:xx:xx:xx
Tue Sep 29 15:27:08 2020 kern.debug kernel: [432066.746438] ieee80211 phy1: staid 1 deleted
Tue Sep 29 15:27:08 2020 daemon.info hostapd: wifi: STA 98:de:d0:xx:xx:xx IEEE 802.11: associated (aid 1)
Tue Sep 29 15:27:08 2020 daemon.notice hostapd: wifi: AP-STA-CONNECTED 98:de:d0:xx:xx:xx
Tue Sep 29 15:27:08 2020 daemon.info hostapd: wifi: STA 98:de:d0:xx:xx:xx WPA: pairwise key handshake completed (RSN)
Tue Sep 29 15:27:27 2020 daemon.info hostapd: wifi: STA 98:de:d0:xx:xx:xx IEEE 802.11: authenticated
Tue Sep 29 15:27:27 2020 kern.debug kernel: [432086.546406] ieee80211 phy1: staid 1 deleted
Tue Sep 29 15:32:08 2020 daemon.notice hostapd: wifi: AP-STA-DISCONNECTED 98:de:d0:xx:xx:xx
Tue Sep 29 15:32:08 2020 daemon.info hostapd: wifi: STA 98:de:d0:xx:xx:xx IEEE 802.11: disassociated due to inactivity
Tue Sep 29 15:32:08 2020 daemon.info hostapd: wifi: STA 98:de:d0:xx:xx:xx IEEE 802.11: associated (aid 1)
Tue Sep 29 15:32:08 2020 daemon.notice hostapd: wifi: AP-STA-CONNECTED 98:de:d0:xx:xx:xx
Tue Sep 29 15:32:08 2020 daemon.info hostapd: wifi: STA 98:de:d0:xx:xx:xx WPA: pairwise key handshake completed (RSN)
Tue Sep 29 15:32:18 2020 daemon.info hostapd: wifi: STA 98:de:d0:xx:xx:xx IEEE 802.11: authenticated

[STA system log]
Tue Sep 29 14:55:14 2020 kern.debug kernel: [58169.232195] probe_send_count++ 1
Tue Sep 29 14:55:14 2020 kern.debug kernel: [58169.235542] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 14:55:14 2020 kern.info kernel: [58169.246499] wifi: probe_send_count: 1  vif.type 2:2
Tue Sep 29 14:55:14 2020 kern.warn kernel: [58169.251875] No ack for nullfunc frame to AP 60:38:e0:cd:1c:91, try 1/2
Tue Sep 29 14:55:14 2020 kern.debug kernel: [58169.258652] probe_send_count++ 2
Tue Sep 29 14:55:14 2020 kern.debug kernel: [58169.261989] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 2
Tue Sep 29 14:55:14 2020 kern.info kernel: [58169.271198] ivan: --- ack  probe_send_count: 2  vif.type 2:2

Tue Sep 29 15:25:40 2020 kern.debug kernel: [59996.019411] probe_send_count++ 1
Tue Sep 29 15:25:40 2020 kern.debug kernel: [59996.022750] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:25:40 2020 kern.info kernel: [59996.031966] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:26:11 2020 kern.debug kernel: [60026.738838] probe_send_count++ 1
Tue Sep 29 15:26:11 2020 kern.debug kernel: [60026.742181] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:26:11 2020 kern.info kernel: [60026.751408] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:26:44 2020 kern.debug kernel: [60059.506214] probe_send_count++ 1
Tue Sep 29 15:26:44 2020 kern.debug kernel: [60059.509552] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:26:44 2020 kern.info kernel: [60059.521346] wifi: --- ack  probe_send_count: 1  vif.type 2:2

Tue Sep 29 15:27:03 2020 daemon.notice wpa_supplicant[1319]: wifi: WPA: Group rekeying completed with 60:38:e0:yy:yy:yy [GTK=CCMP]
Tue Sep 29 15:27:07 2020 kern.info kernel: [60082.764451] wifi: deauthenticated from 60:38:e0:yy:yy:yy (Reason: 16=GROUP_KEY_HANDSHAKE_TIMEOUT)
Tue Sep 29 15:27:07 2020 kern.info kernel: [60082.773803] br-lan: port 3(wifi) entered disabled state
Tue Sep 29 15:27:07 2020 kern.info kernel: [60082.779299] br-lan: topology change detected, propagating
Tue Sep 29 15:27:07 2020 daemon.notice netifd: Network device 'wifi' link is down
Tue Sep 29 15:27:07 2020 daemon.notice wpa_supplicant[1319]: wifi: CTRL-EVENT-DISCONNECTED bssid=60:38:e0:yy:yy:yy reason=16
Tue Sep 29 15:27:08 2020 daemon.notice wpa_supplicant[1319]: wifi: SME: Trying to authenticate with 60:38:e0:yy:yy:yy (SSID='wifi' freq=2432 MHz)
Tue Sep 29 15:27:08 2020 kern.info kernel: [60083.747939] wifi: authenticate with 60:38:e0:yy:yy:yy
Tue Sep 29 15:27:08 2020 kern.info kernel: [60083.767269] wifi: send auth to 60:38:e0:yy:yy:yy (try 1/3)
Tue Sep 29 15:27:08 2020 daemon.notice wpa_supplicant[1319]: wifi: Trying to associate with 60:38:e0:yy:yy:yy (SSID='wifi' freq=2432 MHz)
Tue Sep 29 15:27:08 2020 kern.info kernel: [60083.808370] wifi: authenticated
Tue Sep 29 15:27:08 2020 kern.info kernel: [60083.813911] wifi: associate with 60:38:e0:yy:yy:yy (try 1/3)
Tue Sep 29 15:27:08 2020 kern.info kernel: [60083.838239] wifi: RX AssocResp from 60:38:e0:yy:yy:yy (capab=0x431 status=0 aid=1)
Tue Sep 29 15:27:08 2020 kern.info kernel: [60083.846402] wifi: associated
Tue Sep 29 15:27:08 2020 daemon.notice netifd: Network device 'wifi' link is up
Tue Sep 29 15:27:08 2020 daemon.notice wpa_supplicant[1319]: wifi: Associated with 60:38:e0:yy:yy:yy
Tue Sep 29 15:27:08 2020 daemon.notice wpa_supplicant[1319]: wifi: CTRL-EVENT-SUBNET-STATUS-UPDATE status=0
Tue Sep 29 15:27:08 2020 daemon.notice wpa_supplicant[1319]: wifi: WPA: Key negotiation completed with 60:38:e0:yy:yy:yy [PTK=CCMP GTK=CCMP]
Tue Sep 29 15:27:08 2020 daemon.notice wpa_supplicant[1319]: wifi: CTRL-EVENT-CONNECTED - Connection to 60:38:e0:yy:yy:yy completed [id=0 id_str=]
Tue Sep 29 15:27:08 2020 kern.info kernel: [60083.858846] br-lan: port 3(wifi) entered blocking state
Tue Sep 29 15:27:08 2020 kern.info kernel: [60083.864305] br-lan: port 3(wifi) entered listening state
Tue Sep 29 15:27:10 2020 kern.info kernel: [60085.873708] br-lan: port 3(wifi) entered learning state
Tue Sep 29 15:27:12 2020 kern.info kernel: [60087.889670] br-lan: port 3(wifi) entered forwarding state
Tue Sep 29 15:27:12 2020 kern.info kernel: [60087.895262] br-lan: topology change detected, sending tcn bpdu
Tue Sep 29 15:27:39 2020 kern.debug kernel: [60114.801162] probe_send_count++ 1
Tue Sep 29 15:27:39 2020 kern.debug kernel: [60114.804506] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:27:39 2020 kern.info kernel: [60114.813722] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:27:47 2020 kern.info kernel: [60122.993006] br-lan: port 3(wifi) neighbor 7fff.60:38:e0:yy:yy:yz lost
Tue Sep 29 15:27:47 2020 kern.info kernel: [60122.999677] br-lan: topology change detected, propagating
Tue Sep 29 15:28:12 2020 kern.debug kernel: [60147.568535] probe_send_count++ 1
Tue Sep 29 15:28:12 2020 kern.debug kernel: [60147.571880] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:28:12 2020 kern.info kernel: [60147.581094] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:28:43 2020 kern.debug kernel: [60178.287944] probe_send_count++ 1
Tue Sep 29 15:28:43 2020 kern.debug kernel: [60178.291284] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:28:43 2020 kern.info kernel: [60178.306328] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:29:13 2020 kern.debug kernel: [60209.007373] probe_send_count++ 1
Tue Sep 29 15:29:13 2020 kern.debug kernel: [60209.010716] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:29:13 2020 kern.info kernel: [60209.019935] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:29:44 2020 kern.debug kernel: [60239.727155] probe_send_count++ 1
Tue Sep 29 15:29:44 2020 kern.debug kernel: [60239.730497] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:29:44 2020 kern.info kernel: [60239.739739] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:30:17 2020 kern.debug kernel: [60272.494118] probe_send_count++ 1
Tue Sep 29 15:30:17 2020 kern.debug kernel: [60272.497466] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:30:17 2020 kern.info kernel: [60272.506680] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:30:48 2020 kern.debug kernel: [60303.213517] probe_send_count++ 1
Tue Sep 29 15:30:48 2020 kern.debug kernel: [60303.216860] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:30:48 2020 kern.info kernel: [60303.226077] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:31:18 2020 kern.debug kernel: [60333.932939] probe_send_count++ 1
Tue Sep 29 15:31:18 2020 kern.debug kernel: [60333.936281] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:31:18 2020 kern.info kernel: [60333.947000] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:31:51 2020 kern.debug kernel: [60366.700269] probe_send_count++ 1
Tue Sep 29 15:31:51 2020 kern.debug kernel: [60366.703610] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:31:51 2020 kern.info kernel: [60366.712923] wifi: --- ack  probe_send_count: 1  vif.type 2:2

Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.727820] wifi: disassociated from 60:38:e0:yy:yy:yy (Reason: 4=DISASSOC_DUE_TO_INACTIVITY)
Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.737620] br-lan: port 3(wifi) entered disabled state
Tue Sep 29 15:32:08 2020 daemon.notice netifd: Network device 'wifi' link is down
Tue Sep 29 15:32:08 2020 daemon.notice wpa_supplicant[1319]: wifi: CTRL-EVENT-DISCONNECTED bssid=60:38:e0:yy:yy:yy reason=4
Tue Sep 29 15:32:08 2020 daemon.notice wpa_supplicant[1319]: wifi: SME: Trying to authenticate with 60:38:e0:yy:yy:yy (SSID='wifi' freq=2432 MHz)
Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.792996] wifi: authenticate with 60:38:e0:yy:yy:yy
Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.812320] wifi: send auth to 60:38:e0:yy:yy:yy (try 1/3)
Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.819596] wifi: authenticated
Tue Sep 29 15:32:08 2020 daemon.notice wpa_supplicant[1319]: wifi: Trying to associate with 60:38:e0:yy:yy:yy (SSID='wifi' freq=2432 MHz)
Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.824009] wifi: associate with 60:38:e0:yy:yy:yy (try 1/3)
Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.851498] wifi: RX AssocResp from 60:38:e0:yy:yy:yy (capab=0x431 status=0 aid=1)
Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.859681] wifi: associated
Tue Sep 29 15:32:08 2020 daemon.notice wpa_supplicant[1319]: wifi: Associated with 60:38:e0:yy:yy:yy
Tue Sep 29 15:32:08 2020 daemon.notice wpa_supplicant[1319]: wifi: CTRL-EVENT-SUBNET-STATUS-UPDATE status=0
Tue Sep 29 15:32:08 2020 daemon.notice netifd: Network device 'wifi' link is up
Tue Sep 29 15:32:08 2020 daemon.notice wpa_supplicant[1319]: wifi: WPA: Key negotiation completed with 60:38:e0:yy:yy:yy [PTK=CCMP GTK=CCMP]
Tue Sep 29 15:32:08 2020 daemon.notice wpa_supplicant[1319]: wifi: CTRL-EVENT-CONNECTED - Connection to 60:38:e0:yy:yy:yy completed [id=0 id_str=]
Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.872438] br-lan: port 3(wifi) entered blocking state
Tue Sep 29 15:32:08 2020 kern.info kernel: [60383.877894] br-lan: port 3(wifi) entered listening state
Tue Sep 29 15:32:10 2020 kern.info kernel: [60385.899887] br-lan: port 3(wifi) entered learning state
Tue Sep 29 15:32:12 2020 kern.info kernel: [60387.915843] br-lan: port 3(wifi) entered forwarding state
Tue Sep 29 15:32:12 2020 kern.info kernel: [60387.921434] br-lan: topology change detected, propagating
Tue Sep 29 15:32:40 2020 kern.debug kernel: [60415.851325] probe_send_count++ 1
Tue Sep 29 15:32:40 2020 kern.debug kernel: [60415.854666] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:32:40 2020 kern.info kernel: [60415.864374] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:33:13 2020 kern.debug kernel: [60448.618649] probe_send_count++ 1
Tue Sep 29 15:33:13 2020 kern.debug kernel: [60448.621993] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:33:13 2020 kern.info kernel: [60448.631213] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:33:44 2020 kern.debug kernel: [60479.338025] probe_send_count++ 1
Tue Sep 29 15:33:44 2020 kern.debug kernel: [60479.341370] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:33:44 2020 kern.info kernel: [60479.350602] wifi: --- ack  probe_send_count: 1  vif.type 2:2
Tue Sep 29 15:34:15 2020 kern.debug kernel: [60510.057408] probe_send_count++ 1
Tue Sep 29 15:34:15 2020 kern.debug kernel: [60510.060748] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
Tue Sep 29 15:34:15 2020 kern.info kernel: [60510.075731] wifi: --- ack  probe_send_count: 1  vif.type 2:2

[STA kernel log]
[59996.019411] probe_send_count++ 1
[59996.022750] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[59996.031966] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60026.738838] probe_send_count++ 1
[60026.742181] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60026.751408] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60059.506214] probe_send_count++ 1
[60059.509552] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60059.521346] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60082.764451] wifi: deauthenticated from 60:38:e0:yy:yy:yy (Reason: 16=GROUP_KEY_HANDSHAKE_TIMEOUT)
[60082.773803] br-lan: port 3(wifi) entered disabled state
[60082.779299] br-lan: topology change detected, propagating
[60083.747939] wifi: authenticate with 60:38:e0:yy:yy:yy
[60083.767269] wifi: send auth to 60:38:e0:yy:yy:yy (try 1/3)
[60083.808370] wifi: authenticated
[60083.813911] wifi: associate with 60:38:e0:yy:yy:yy (try 1/3)
[60083.838239] wifi: RX AssocResp from 60:38:e0:yy:yy:yy (capab=0x431 status=0 aid=1)
[60083.846402] wifi: associated
[60083.858846] br-lan: port 3(wifi) entered blocking state
[60083.864305] br-lan: port 3(wifi) entered listening state
[60085.873708] br-lan: port 3(wifi) entered learning state
[60087.889670] br-lan: port 3(wifi) entered forwarding state
[60087.895262] br-lan: topology change detected, sending tcn bpdu
[60114.801162] probe_send_count++ 1
[60114.804506] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60114.813722] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60122.993006] br-lan: port 3(wifi) neighbor 7fff.60:38:e0:yy:yy:yz lost
[60122.999677] br-lan: topology change detected, propagating
[60147.568535] probe_send_count++ 1
[60147.571880] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60147.581094] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60178.287944] probe_send_count++ 1
[60178.291284] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60178.306328] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60209.007373] probe_send_count++ 1
[60209.010716] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60209.019935] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60239.727155] probe_send_count++ 1
[60239.730497] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60239.739739] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60272.494118] probe_send_count++ 1
[60272.497466] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60272.506680] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60303.213517] probe_send_count++ 1
[60303.216860] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60303.226077] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60333.932939] probe_send_count++ 1
[60333.936281] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60333.947000] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60366.700269] probe_send_count++ 1
[60366.703610] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60366.712923] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60383.727820] wifi: disassociated from 60:38:e0:yy:yy:yy (Reason: 4=DISASSOC_DUE_TO_INACTIVITY)
[60383.737620] br-lan: port 3(wifi) entered disabled state
[60383.792996] wifi: authenticate with 60:38:e0:yy:yy:yy
[60383.812320] wifi: send auth to 60:38:e0:yy:yy:yy (try 1/3)
[60383.819596] wifi: authenticated
[60383.824009] wifi: associate with 60:38:e0:yy:yy:yy (try 1/3)
[60383.851498] wifi: RX AssocResp from 60:38:e0:yy:yy:yy (capab=0x431 status=0 aid=1)
[60383.859681] wifi: associated
[60383.872438] br-lan: port 3(wifi) entered blocking state
[60383.877894] br-lan: port 3(wifi) entered listening state
[60385.899887] br-lan: port 3(wifi) entered learning state
[60387.915843] br-lan: port 3(wifi) entered forwarding state
[60387.921434] br-lan: topology change detected, propagating
[60415.851325] probe_send_count++ 1
[60415.854666] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60415.864374] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60448.618649] probe_send_count++ 1
[60448.621993] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60448.631213] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60479.338025] probe_send_count++ 1
[60479.341370] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60479.350602] wifi: --- ack  probe_send_count: 1  vif.type 2:2
[60510.057408] probe_send_count++ 1
[60510.060748] ieee80211_send_nullfunc(false) from ieee80211_mgd_probe_ap_send, probe_send_count 1
[60510.075731] wifi: --- ack  probe_send_count: 1  vif.type 2:2




> On 2020-09-29, at 3:13 PM, Felix Fietkau <nbd@nbd.name> wrote:
> 
> 
> On 2020-09-28 19:09, Georgi Valkov wrote:
>> Hi Felix!
>> 
>> Unfortunately it disconnected again after 21 hours and 44 minutes of uptime, and remained offline for 5 minutes again.
>> I will add some code to ieee80211_sta_work() to trace more info when a disconnect occurs.
>> Do you have any ideas? Why does it remain offline for exactly 5 minutes every time? Do we need to reset probe_send_count in rx.c? Do we have to send probe requests or nullfunc while there is active communication, e.g. acknowledged ICMP traffic?
>> 
>> This is the most common event in the trace, it occurs every 30 seconds, we called ieee80211_sta_tx_notify(ack: true), and ieee80211_is_any_nullfunc() returned true, so probe_send_count is reset:
>> --- ack  probe_send_count: 1  vif.type 2:2
>> 
>> These events are rare, I could not find any pattern. probe_send_count is reset inside status.c
>> reset connection monitor  probe_send_count: 1  nullfunc: 0
>> 
>> These events are rare and appear randomly. The first indicates that we called ieee80211_sta_tx_notify(ack: false), and ieee80211_is_any_nullfunc() returned true, so probe_send_count is not reset, while the second always appears 10ms later with ack = true, and ieee80211_is_any_nullfunc() returned true, so probe_send_count is reset.
>> probe_send_count: 1  vif.type 2:2
>> --- ack  probe_send_count: 2  vif.type 2:2
> So far I don't see any strong indication that these disconnects are
> related to this patch. The debug messages you added seem normal to me.
> More comments on the disconnects below.
> 
>> Here are some interesting parts from the trace:
>> 
>> [STA kernel log]
>> [55045.254528] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [55075.971445] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
>> [55106.699439] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> ...
>> [57812.177775] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [57842.894483] wifi: reset connection monitor  probe_send_count: 1  nullfunc: 0
>> [57873.610852] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> ...
>> [78172.681502] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78203.398703] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78234.118347] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78264.834126] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78282.530133] wifi: deauthenticated from 60:38:e0:xx:xx:xx (Reason: 16=GROUP_KEY_HANDSHAKE_TIMEOUT)
> This one is definitely not related to the connection monitor.
> 
>> [78282.539463] br-lan: port 3(wifi) entered disabled state
>> [78282.544987] br-lan: topology change detected, propagating
>> [78283.514078] wifi: authenticate with 60:38:e0:xx:xx:xx
>> [78283.533405] wifi: send auth to 60:38:e0:xx:xx:xx (try 1/3)
>> [78283.577391] wifi: authenticated
>> [78283.583874] wifi: associate with 60:38:e0:xx:xx:xx (try 1/3)
>> [78283.610166] wifi: RX AssocResp from 60:38:e0:xx:xx:xx (capab=0x431 status=0 aid=1)
>> [78283.618322] wifi: associated
>> [78283.628908] br-lan: port 3(wifi) entered blocking state
>> [78283.634358] br-lan: port 3(wifi) entered listening state
>> [78285.663507] br-lan: port 3(wifi) entered learning state
>> [78287.679338] br-lan: port 3(wifi) entered forwarding state
>> [78287.684939] br-lan: topology change detected, sending tcn bpdu
>> [78313.981348] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78316.028959] br-lan: port 3(wifi) neighbor 7fff.60:38:e0:xx:xx:xy lost
>> [78316.035630] br-lan: topology change detected, propagating
>> [78346.746657] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78377.464074] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78408.181527] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78438.898977] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78471.664287] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78502.381794] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78533.099298] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78563.816824] wifi: --- ack  probe_send_count: 1  vif.type 2:2
>> [78583.518337] wifi: disassociated from 60:38:e0:xx:xx:xx (Reason: 4=DISASSOC_DUE_TO_INACTIVITY)
> Are you sure that this one is even related to the connection monitor? It
> seems to me that the client may have been kicked from the AP due to
> inactivity (might be AP probing, not STA probing).
> 
> - Felix
diff mbox series

Patch

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 50a9b9025725..7c04d8e30482 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2508,7 +2508,9 @@  void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
 	    !sdata->u.mgd.probe_send_count)
 		return;
 
-	if (!ack)
+	if (ack)
+		sdata->u.mgd.probe_send_count = 0;
+	else
 		sdata->u.mgd.nullfunc_failed = true;
 	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
 }
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 7fe5bececfd9..88a736f3c413 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -982,10 +982,6 @@  static void __ieee80211_tx_status(struct ieee80211_hw *hw,
 		if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked)
 			ieee80211_frame_acked(sta, skb);
 
-		if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) &&
-		    ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
-			ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data,
-						acked, info->status.tx_time);
 	}
 
 	/* SNMP counters
@@ -1120,11 +1116,18 @@  void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 	noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED);
 
 	if (pubsta) {
+		struct ieee80211_sub_if_data *sdata = sta->sdata;
+
 		if (!acked && !noack_success)
 			sta->status_stats.retry_failed++;
 		sta->status_stats.retry_count += retry_count;
 
 		if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
+			if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+			    skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP))
+				ieee80211_sta_tx_notify(sdata, (void *) skb->data,
+							acked, info->status.tx_time);
+
 			if (acked) {
 				sta->status_stats.last_ack = jiffies;
 
@@ -1134,6 +1137,11 @@  void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
 				/* Track when last packet was ACKed */
 				sta->status_stats.last_pkt_time = jiffies;
 
+				/* Reset connection monitor */
+				if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+				    unlikely(sdata->u.mgd.probe_send_count > 0))
+					sdata->u.mgd.probe_send_count = 0;
+
 				if (info->status.is_valid_ack_signal) {
 					sta->status_stats.last_ack_signal =
 							 (s8)info->status.ack_signal;