diff mbox

[v9,2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue

Message ID 20160922170420.5193-3-toke@toke.dk (mailing list archive)
State Accepted
Delegated to: Johannes Berg
Headers show

Commit Message

Toke Høiland-Jørgensen Sept. 22, 2016, 5:04 p.m. UTC
The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases all over the place
for when TXQ is enabled, at the cost of making the fast_xmit and TX
handler code slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   8 ++
 net/mac80211/rx.c          |   4 +-
 net/mac80211/sta_info.c    |  10 +-
 net/mac80211/tx.c          | 287 +++++++++++++++++++++++++++++++++------------
 net/mac80211/util.c        |  11 +-
 6 files changed, 232 insertions(+), 90 deletions(-)

Comments

Johannes Berg Sept. 30, 2016, 10:27 a.m. UTC | #1
Hi Toke,

Sorry for the delay reviewing this.

I think I still have a few comments/questions.

> +static inline bool txq_has_queue(struct ieee80211_txq *txq)
> +{
> +	struct txq_info *txqi = to_txq_info(txq);
> +	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
> +}

Tiny nit - there should probably be a blank line between the two lines
here, but I could just fix that when I apply if you don't resend anyway
for some other reason.

[snip helper stuff that looks fine]

> -	if (!tx->sta->sta.txq[0])
> -		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
> +	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);

Just to make sure I get this right - this is because the handler is now
run on dequeue, so the special case is no longer needed?

>  #define CALL_TXH(txh) \
> @@ -1656,6 +1684,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);

Just for reference - the code block here that's unchanged contains
this:

        CALL_TXH(ieee80211_tx_h_dynamic_ps);
        CALL_TXH(ieee80211_tx_h_check_assoc);
        CALL_TXH(ieee80211_tx_h_ps_buf);
        CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
        CALL_TXH(ieee80211_tx_h_select_key);
        if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
                CALL_TXH(ieee80211_tx_h_rate_ctrl);

> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
> +{
> +	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
> +	ieee80211_tx_result res = TX_CONTINUE;
> +
>  	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>  		__skb_queue_tail(&tx->skbs, tx->skb);
>  		tx->skb = NULL;

And this code here is also unchanged from the original TX handler
invocation, so contains this:

        if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
                __skb_queue_tail(&tx->skbs, tx->skb);
                tx->skb = NULL;
                goto txh_done;
        }

        CALL_TXH(ieee80211_tx_h_michael_mic_add);
        CALL_TXH(ieee80211_tx_h_sequence);
        CALL_TXH(ieee80211_tx_h_fragment);
        /* handlers after fragment must be aware of tx info fragmentation! */
        CALL_TXH(ieee80211_tx_h_stats);
        CALL_TXH(ieee80211_tx_h_encrypt);
        if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
                CALL_TXH(ieee80211_tx_h_calculate_duration);

But now you have a problem (that you solved) that the key pointer can
be invalidated while you have the packet queued between the two points,
and then the tx_h_michael_mic_add and/or tx_h_encrypt would crash.

You solve this by re-running tx_h_select_key() on dequeue, but it's not
clear to me why you didn't move that to the late handlers instead?

I *think* it should commute with the rate control handler, but even so,
wouldn't it make more sense to have rate control late? Assuming the
packets are queued for some amount of time, having rate control
information queued with them would get stale.

Similarly, it seems to me that checking the control port protocol later
(or perhaps duplicating that?) would be a good idea?


> +/*
> + * Can be called while the sta lock is held. Anything that can cause packets to
> + * be generated will cause deadlock!
> + */
> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
> +				       struct sta_info *sta, u8 pn_offs,
> +				       struct ieee80211_key *key,
> +				       struct sk_buff *skb)

That should be a void function now, you never check the return value
and only return true anyway.

> +	struct ieee80211_tx_info *info;
> +	struct ieee80211_tx_data tx;
> +	ieee80211_tx_result r;
> +

nit: extra blank line

>  	spin_lock_bh(&fq->lock);
>  
>  	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>  		goto out;
>  
> +	/* Make sure fragments stay together. */
> +	skb = __skb_dequeue(&txqi->frags);
> +	if (skb)
> +		goto out;
> +
> +begin:

I guess now that you introduced that anyway, we should consider making
the skb_linearize() failure go there. Should be a follow-up patch
though.

> +	/*
> +	 * The key can be removed while the packet was queued, so need to call
> +	 * this here to get the current key.
> +	 */
> +	r = ieee80211_tx_h_select_key(&tx);
> +	if (r != TX_CONTINUE) {
> +		ieee80211_free_txskb(&local->hw, skb);
> +		goto begin;
> +	}
> +
> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {

It's a bit unfortunate that you lose fast-xmit here completely for the
key stuff, but I don't see a good way to avoid that, other than
completely rejiggering all the (possibly affected) queues when keys
change... might be very complex to do that, certainly a follow-up patch
if it's desired.

This check seems a bit weird though - how could fast-xmit be set
without a TXQ station?

> +++ b/net/mac80211/util.c
> @@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
>  			     unsigned long *byte_cnt)
>  {
>  	struct txq_info *txqi = to_txq_info(txq);
> +	u32 frag_cnt = 0, frag_bytes = 0;
> +	struct sk_buff *skb;
> +
> +	skb_queue_walk(&txqi->frags, skb) {
> +		frag_cnt++;
> +		frag_bytes += skb->len;
> +	}

I hope this is called infrequently :)

johannes
Toke Høiland-Jørgensen Sept. 30, 2016, 12:39 p.m. UTC | #2
Johannes Berg <johannes@sipsolutions.net> writes:

> Hi Toke,
>
> Sorry for the delay reviewing this.
>
> I think I still have a few comments/questions.

No worries. And not terribly surprised ;)

>> +static inline bool txq_has_queue(struct ieee80211_txq *txq)
>> +{
>> +	struct txq_info *txqi = to_txq_info(txq);
>> +	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
>> +}
>
> Tiny nit - there should probably be a blank line between the two lines
> here, but I could just fix that when I apply if you don't resend anyway
> for some other reason.

Noted.

> [snip helper stuff that looks fine]
>
>> -	if (!tx->sta->sta.txq[0])
>> -		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
>> +	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
>
> Just to make sure I get this right - this is because the handler is now
> run on dequeue, so the special case is no longer needed?

Yup. The same change is made in xmit_fast (but obscured by the moving of
the surrounding code into _finish()).

>>  #define CALL_TXH(txh) \
>> @@ -1656,6 +1684,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);
>
> Just for reference - the code block here that's unchanged contains
> this:
>
>         CALL_TXH(ieee80211_tx_h_dynamic_ps);
>         CALL_TXH(ieee80211_tx_h_check_assoc);
>         CALL_TXH(ieee80211_tx_h_ps_buf);
>         CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
>         CALL_TXH(ieee80211_tx_h_select_key);
>         if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>                 CALL_TXH(ieee80211_tx_h_rate_ctrl);
>
>> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
>> +{
>> +	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
>> +	ieee80211_tx_result res = TX_CONTINUE;
>> +
>>  	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>>  		__skb_queue_tail(&tx->skbs, tx->skb);
>>  		tx->skb = NULL;
>
> And this code here is also unchanged from the original TX handler
> invocation, so contains this:
>
>         if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>                 __skb_queue_tail(&tx->skbs, tx->skb);
>                 tx->skb = NULL;
>                 goto txh_done;
>         }
>
>         CALL_TXH(ieee80211_tx_h_michael_mic_add);
>         CALL_TXH(ieee80211_tx_h_sequence);
>         CALL_TXH(ieee80211_tx_h_fragment);
>         /* handlers after fragment must be aware of tx info fragmentation! */
>         CALL_TXH(ieee80211_tx_h_stats);
>         CALL_TXH(ieee80211_tx_h_encrypt);
>         if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>                 CALL_TXH(ieee80211_tx_h_calculate_duration);
>
> But now you have a problem (that you solved) that the key pointer can
> be invalidated while you have the packet queued between the two points,
> and then the tx_h_michael_mic_add and/or tx_h_encrypt would crash.
>
> You solve this by re-running tx_h_select_key() on dequeue, but it's not
> clear to me why you didn't move that to the late handlers instead?

Because I need to run it anyway for the xmit_fast path on dequeue. I
thought doing it this way simplifies the code (at the cost of the
handler getting called twice when xmit_fast is not active).

> I *think* it should commute with the rate control handler, but even
> so, wouldn't it make more sense to have rate control late? Assuming
> the packets are queued for some amount of time, having rate control
> information queued with them would get stale.

Yes, having rate control run at dequeue would be good, and that's what I
did initially. However, I found that this would lead to a deadlock
because the rate control handler would send out packets in some cases (I
forget the details but can go back and check if needed). And since the
dequeue function is called with the driver TXQ lock held, that would
lead to a deadlock when those packets made it to the driver TX path.

So I decided to just keep it this way for now; I plan to go poking into
the rate controller later anyway, so moving the handler to later could
be part of that.

> Similarly, it seems to me that checking the control port protocol later
> (or perhaps duplicating that?) would be a good idea?

But that handler only sets a few flags? Is
tx->sdata->control_port_protocol likely to change while the packet is
queued?

>> +/*
>> + * Can be called while the sta lock is held. Anything that can cause packets to
>> + * be generated will cause deadlock!
>> + */
>> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
>> +				       struct sta_info *sta, u8 pn_offs,
>> +				       struct ieee80211_key *key,
>> +				       struct sk_buff *skb)
>
> That should be a void function now, you never check the return value
> and only return true anyway.

Noted.

>> +	struct ieee80211_tx_info *info;
>> +	struct ieee80211_tx_data tx;
>> +	ieee80211_tx_result r;
>> +
>
> nit: extra blank line

The horror ;) (thought I got rid of all those; ah well, will fix)
>
>>  	spin_lock_bh(&fq->lock);
>>  
>>  	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>>  		goto out;
>>  
>> +	/* Make sure fragments stay together. */
>> +	skb = __skb_dequeue(&txqi->frags);
>> +	if (skb)
>> +		goto out;
>> +
>> +begin:
>
> I guess now that you introduced that anyway, we should consider making
> the skb_linearize() failure go there. Should be a follow-up patch
> though.

Can do.

>
>> +	/*
>> +	 * The key can be removed while the packet was queued, so need to call
>> +	 * this here to get the current key.
>> +	 */
>> +	r = ieee80211_tx_h_select_key(&tx);
>> +	if (r != TX_CONTINUE) {
>> +		ieee80211_free_txskb(&local->hw, skb);
>> +		goto begin;
>> +	}
>> +
>> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>
> It's a bit unfortunate that you lose fast-xmit here completely for the
> key stuff, but I don't see a good way to avoid that, other than
> completely rejiggering all the (possibly affected) queues when keys
> change... might be very complex to do that, certainly a follow-up
> patch if it's desired.

Yeah, figured it was better to have something that's correct and then go
back and change it if the performance hit turns out to be too high.

> This check seems a bit weird though - how could fast-xmit be set
> without a TXQ station?

I think that is probably just left over from before I introduced the
control flag. Should be fine to remove it.

>> +++ b/net/mac80211/util.c
>> @@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
>>  			     unsigned long *byte_cnt)
>>  {
>>  	struct txq_info *txqi = to_txq_info(txq);
>> +	u32 frag_cnt = 0, frag_bytes = 0;
>> +	struct sk_buff *skb;
>> +
>> +	skb_queue_walk(&txqi->frags, skb) {
>> +		frag_cnt++;
>> +		frag_bytes += skb->len;
>> +	}
>
> I hope this is called infrequently :)

Well, ath10k is the only user. It does get called on each wake_tx_queue,
though, so not that infrequently. My reasoning was that since the frags
queue is never going to have more than a fairly small number of packets
in it (those produced from a single split packet), counting this way is
acceptable instead of keeping a state variable up to date. Can change it
if you disagree :)


Not sure if you want a v10, or if you're satisfied with the above
comments and will just fix up the nits on merging?

-Toke
Johannes Berg Sept. 30, 2016, 12:43 p.m. UTC | #3
> Because I need to run it anyway for the xmit_fast path on dequeue. I
> thought doing it this way simplifies the code (at the cost of the
> handler getting called twice when xmit_fast is not active).

Ok, that's fair.

> > I *think* it should commute with the rate control handler, but even
> > so, wouldn't it make more sense to have rate control late? Assuming
> > the packets are queued for some amount of time, having rate control
> > information queued with them would get stale.
> 
> Yes, having rate control run at dequeue would be good, and that's
> what I did initially. However, I found that this would lead to a
> deadlock because the rate control handler would send out packets in
> some cases (I forget the details but can go back and check if
> needed). And since the dequeue function is called with the driver TXQ
> lock held, that would lead to a deadlock when those packets made it
> to the driver TX path.

That seems really odd, but I can see how a deadlock happens then.

> So I decided to just keep it this way for now; I plan to go poking
> into the rate controller later anyway, so moving the handler to later
> could be part of that.

Sure, that's fair.

> But that handler only sets a few flags? Is
> tx->sdata->control_port_protocol likely to change while the packet is
> queued?

Oh right, I confused things there. We check the controlled port much
earlier, but anyway that should be OK.

> > It's a bit unfortunate that you lose fast-xmit here completely for
> > the key stuff, but I don't see a good way to avoid that, other than
> > completely rejiggering all the (possibly affected) queues when keys
> > change... might be very complex to do that, certainly a follow-up
> > patch if it's desired.
> 
> Yeah, figured it was better to have something that's correct and then
> go back and change it if the performance hit turns out to be too
> high.

Makes sense.

> > This check seems a bit weird though - how could fast-xmit be set
> > without a TXQ station?
> 
> I think that is probably just left over from before I introduced the
> control flag. Should be fine to remove it.

Ok.

> > 
> > > 
> > > +++ b/net/mac80211/util.c
> > > @@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
> > >  			     unsigned long *byte_cnt)
> > >  {
> > >  	struct txq_info *txqi = to_txq_info(txq);
> > > +	u32 frag_cnt = 0, frag_bytes = 0;
> > > +	struct sk_buff *skb;
> > > +
> > > +	skb_queue_walk(&txqi->frags, skb) {
> > > +		frag_cnt++;
> > > +		frag_bytes += skb->len;
> > > +	}
> > 
> > I hope this is called infrequently :)
> 
> Well, ath10k is the only user. It does get called on each
> wake_tx_queue, though, so not that infrequently. My reasoning was
> that since the frags queue is never going to have more than a fairly
> small number of packets in it (those produced from a single split
> packet), counting this way is acceptable instead of keeping a state
> variable up to date. Can change it if you disagree :)

No, I guess you're right, it can't be a long queue.

> Not sure if you want a v10, or if you're satisfied with the above
> comments and will just fix up the nits on merging?
> 

I'll fix it up. Thanks!

johannes
Toke Høiland-Jørgensen Sept. 30, 2016, 12:45 p.m. UTC | #4
Johannes Berg <johannes@sipsolutions.net> writes:

>> Not sure if you want a v10, or if you're satisfied with the above
>> comments and will just fix up the nits on merging?
>> 
>
> I'll fix it up. Thanks!

Cool, thanks :)

-Toke
Johannes Berg Sept. 30, 2016, 12:49 p.m. UTC | #5
Applied, with the nits fixed as discussed.

Come to think of it, if somebody is bored ;-) perhaps a hwsim option to
use TXQs (should be optional I guess) would be nice so we can exercise
this code with the wpa_supplicant hwsim tests. That would have caught
the TKIP issues etc. pretty early on too, I think.

johannes
Toke Høiland-Jørgensen Sept. 30, 2016, 2:01 p.m. UTC | #6
Johannes Berg <johannes@sipsolutions.net> writes:

> Applied, with the nits fixed as discussed.

Awesome, thanks!

> Come to think of it, if somebody is bored ;-) perhaps a hwsim option
> to use TXQs (should be optional I guess) would be nice so we can
> exercise this code with the wpa_supplicant hwsim tests. That would
> have caught the TKIP issues etc. pretty early on too, I think.

Noted. I'll look into that the next time I'm bored ;)

-Toke
diff mbox

Patch

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5296100..9463039 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@  enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@  enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		= BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
 };
 
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c71c735..caca265 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,12 +813,14 @@  enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin hashes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
 	struct codel_stats cstats;
+	struct sk_buff_head frags;
 	unsigned long flags;
 
 	/* keep last! */
@@ -1481,6 +1483,12 @@  static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq)
 	return container_of(txq, struct txq_info, txq);
 }
 
+static inline bool txq_has_queue(struct ieee80211_txq *txq)
+{
+	struct txq_info *txqi = to_txq_info(txq);
+	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
+}
+
 static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
 {
 	return ether_addr_equal(raddr, addr) ||
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index e796060..ae5786b8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1323,9 +1323,7 @@  static void sta_ps_start(struct sta_info *sta)
 		return;
 
 	for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
-		struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
-
-		if (txqi->tin.backlog_packets)
+		if (txq_has_queue(sta->sta.txq[tid]))
 			set_bit(tid, &sta->txq_buffered_tids);
 		else
 			clear_bit(tid, &sta->txq_buffered_tids);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 1b1b28f..167bff0 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1212,12 +1212,10 @@  void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
 
 	if (sta->sta.txq[0]) {
 		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
-			struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
-
-			if (!txqi->tin.backlog_packets)
+			if (!txq_has_queue(sta->sta.txq[i]))
 				continue;
 
-			drv_wake_tx_queue(local, txqi);
+			drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
 		}
 	}
 
@@ -1649,9 +1647,7 @@  ieee80211_sta_ps_deliver_response(struct sta_info *sta,
 			return;
 
 		for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
-			struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
-
-			if (!(tids & BIT(tid)) || txqi->tin.backlog_packets)
+			if (!(tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid]))
 				continue;
 
 			sta_info_recalc_tim(sta);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e8c9964..75e6adf 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -853,8 +853,7 @@  ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
 
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
 
 	return TX_CONTINUE;
 }
@@ -1404,6 +1403,7 @@  void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
 	codel_stats_init(&txqi->cstats);
+	__skb_queue_head_init(&txqi->frags);
 
 	txqi->txq.vif = &sdata->vif;
 
@@ -1426,6 +1426,7 @@  void ieee80211_txq_purge(struct ieee80211_local *local,
 	struct fq_tin *tin = &txqi->tin;
 
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1476,6 +1477,47 @@  void ieee80211_txq_teardown_flows(struct ieee80211_local *local)
 	spin_unlock_bh(&fq->lock);
 }
 
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct fq *fq = &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta = &sta->sta;
+	else
+		pubsta = NULL;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif = &sdata->vif;
+	txqi = ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif = vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1483,9 +1525,7 @@  static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control = {};
-	struct fq *fq = &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1500,21 +1540,6 @@  static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
-		txqi = ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif = vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1635,10 +1660,13 @@  static bool __ieee80211_tx(struct ieee80211_local *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is everything
+ * that can be sensitive to reordering, and will be deferred to after packets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res = TX_DROP;
 
 #define CALL_TXH(txh) \
@@ -1656,6 +1684,31 @@  static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res = TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb = NULL;
@@ -1688,6 +1741,15 @@  static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	return 0;
 }
 
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r = invoke_tx_handlers_early(tx);
+
+	if (r)
+		return r;
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1762,7 +1824,13 @@  static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		info->hw_queue =
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
 
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result = __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
 
@@ -3106,8 +3174,73 @@  out:
 	return ret;
 }
 
+/*
+ * Can be called while the sta lock is held. Anything that can cause packets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
+				       struct sta_info *sta, u8 pn_offs,
+				       struct ieee80211_key *key,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	u8 tid = IEEE80211_NUM_TIDS;
+
+	if (key)
+		info->control.hw_key = &key->conf;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) = tid;
+		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number += 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
+	/* statistics normally done by ieee80211_tx_h_stats (but that
+	 * has to consider fragmentation, so is more complex)
+	 */
+	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
+	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
+
+	if (pn_offs) {
+		u64 pn;
+		u8 *crypto_hdr = skb->data + pn_offs;
+
+		switch (key->conf.cipher) {
+		case WLAN_CIPHER_SUITE_CCMP:
+		case WLAN_CIPHER_SUITE_CCMP_256:
+		case WLAN_CIPHER_SUITE_GCMP:
+		case WLAN_CIPHER_SUITE_GCMP_256:
+			pn = atomic64_inc_return(&key->conf.tx_pn);
+			crypto_hdr[0] = pn;
+			crypto_hdr[1] = pn >> 8;
+			crypto_hdr[4] = pn >> 16;
+			crypto_hdr[5] = pn >> 24;
+			crypto_hdr[6] = pn >> 32;
+			crypto_hdr[7] = pn >> 40;
+			break;
+		}
+	}
+
+	return true;
+}
+
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3158,8 +3291,6 @@  static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 			return true;
 	}
 
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3188,24 +3319,7 @@  static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) = tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number += 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
 
 	__skb_queue_head_init(&tx.skbs);
 
@@ -3215,9 +3329,6 @@  static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 	tx.sta = sta;
 	tx.key = fast_tx->key;
 
-	if (fast_tx->key)
-		info->control.hw_key = &fast_tx->key->conf;
-
 	if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
 		tx.skb = skb;
 		r = ieee80211_tx_h_rate_ctrl(&tx);
@@ -3231,31 +3342,11 @@  static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	/* statistics normally done by ieee80211_tx_h_stats (but that
-	 * has to consider fragmentation, so is more complex)
-	 */
-	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
-	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
-
-	if (fast_tx->pn_offs) {
-		u64 pn;
-		u8 *crypto_hdr = skb->data + fast_tx->pn_offs;
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
 
-		switch (fast_tx->key->conf.cipher) {
-		case WLAN_CIPHER_SUITE_CCMP:
-		case WLAN_CIPHER_SUITE_CCMP_256:
-		case WLAN_CIPHER_SUITE_GCMP:
-		case WLAN_CIPHER_SUITE_GCMP_256:
-			pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn);
-			crypto_hdr[0] = pn;
-			crypto_hdr[1] = pn >> 8;
-			crypto_hdr[4] = pn >> 16;
-			crypto_hdr[5] = pn >> 24;
-			crypto_hdr[6] = pn >> 32;
-			crypto_hdr[7] = pn >> 40;
-			break;
-		}
-	}
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs,
+				   fast_tx->key, skb);
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 		sdata = container_of(sdata->bss,
@@ -3275,12 +3366,22 @@  struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	struct sk_buff *skb = NULL;
 	struct fq *fq = &local->fq;
 	struct fq_tin *tin = &txqi->tin;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+
 
 	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
+	/* Make sure fragments stay together. */
+	skb = __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
@@ -3288,16 +3389,46 @@  struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	ieee80211_set_skb_vif(skb, txqi);
 
 	hdr = (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info = IEEE80211_SKB_CB(skb);
+
+	memset(&tx, 0, sizeof(tx));
+	__skb_queue_head_init(&tx.skbs);
+	tx.local = local;
+	tx.skb = skb;
+	tx.sdata = vif_to_sdata(info->control.vif);
+
+	if (txq->sta)
+		tx.sta = container_of(txq->sta, struct sta_info, sta);
+
+	/*
+	 * The key can be removed while the packet was queued, so need to call
+	 * this here to get the current key.
+	 */
+	r = ieee80211_tx_h_select_key(&tx);
+	if (r != TX_CONTINUE) {
+		ieee80211_free_txskb(&local->hw, skb);
+		goto begin;
+	}
+
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		u8 pn_offs = 0;
 
-		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+		if (tx.key &&
+		    (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV))
+			pn_offs = ieee80211_hdrlen(hdr->frame_control);
+
+		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
+					   tx.key, skb);
+	} else {
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb = __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
 
 out:
@@ -3335,7 +3466,7 @@  void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 		fast_tx = rcu_dereference(sta->fast_tx);
 
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index b6865d8..8006f9a 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3393,11 +3393,18 @@  void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
 			     unsigned long *byte_cnt)
 {
 	struct txq_info *txqi = to_txq_info(txq);
+	u32 frag_cnt = 0, frag_bytes = 0;
+	struct sk_buff *skb;
+
+	skb_queue_walk(&txqi->frags, skb) {
+		frag_cnt++;
+		frag_bytes += skb->len;
+	}
 
 	if (frame_cnt)
-		*frame_cnt = txqi->tin.backlog_packets;
+		*frame_cnt = txqi->tin.backlog_packets + frag_cnt;
 
 	if (byte_cnt)
-		*byte_cnt = txqi->tin.backlog_bytes;
+		*byte_cnt = txqi->tin.backlog_bytes + frag_bytes;
 }
 EXPORT_SYMBOL(ieee80211_txq_get_depth);