diff mbox

[PATCHv2,1/2] mac80211: implement fair queuing per txq

Message ID 1459420104-31554-2-git-send-email-michal.kazior@tieto.com (mailing list archive)
State Changes Requested
Delegated to: Johannes Berg
Headers show

Commit Message

Michal Kazior March 31, 2016, 10:28 a.m. UTC
Qdiscs assume all packets regardless of
destination address are treated equally by
underlying an device link.

This isn't true for wireless where each node is a
link in it's own right with different and varying
signal quality over time.

Existing wireless behavior stuffs device tx queues
with no regard to link conditions. This can result
in queue buildup for slow stations and an inertia
worth of seconds making it impossible for small
bursty traffic to come through.

The current high-level idea is to keep roughly 1-2
txops worth of data in device tx queues to allow
short bursts to be handled responsively.

mac80211's software queues were designed to work
very closely with device tx queues. They are
required to make use of 802.11 packet aggregation
easily and efficiently.

However the logic imposed a per-AC queue limit.
With the limit too small mac80211 wasn't be able
to guarantee fairness across TIDs nor stations
because single burst to a slow station could
monopolize queues and reach per-AC limit
preventing traffic from other stations being
queued into mac80211's software queues. Having the
limit too large would make smart qdiscs, e.g.
fq_codel, a lot less efficient as they are
designed on the premise that they are very close
to the actualy device tx queues.

The patch implements fq_codel-ish logic in
mac80211's software queuing. This doesn't directly
translate to immediate and significant gains.
Moreover only wake_tx_queue based drivers will be
able to reap the benefits of fair queuing for now.

More work is required to make sure drivers keep
their device tx queues at minimum fill (instead of
clogging them up until they're full regardless of
link conditions). Only then the full effect of
fair queuing will be observable.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
---

Notes:
    v2:
     * fix invalid ptr deref (I accidentally removed `info` ptr assignment..)
    
    v1:
     * move txq_limit and txq_cparams from ieee80211_hw to ieee80211_fq
     * remove printks
     * improve commit log
     * various cleanups
     * extra stats
     * split out the core txq fairness changes
       * should_drop() doesn't consider bursts
       * codel target is hardcoded to 20ms
    
    RFC v2:
     * actually re-use txq_flows on enqueue [Felix]
     * tune should_drop() to consider bursts wrt station expected tput [Dave/Bob]
     * make codel target time scale via ewma of estimated txqi service period [Dave]
     * generic tx scheduling (with time-based queue limit and naive hysteresis)
       * tracking per-frame expected duration
       * tracking per-txqi in-flight data duration
       * tracking per-hw in-flight data duration
       ? in-flight means scheduled to driver and assumes driver does report
         tx-status on actual tx-completion
     * added a few debugfs entries

 include/net/mac80211.h     |  21 ++-
 net/mac80211/agg-tx.c      |   8 +-
 net/mac80211/codel.h       | 264 +++++++++++++++++++++++++++++++
 net/mac80211/codel_i.h     |  89 +++++++++++
 net/mac80211/ieee80211_i.h |  45 +++++-
 net/mac80211/iface.c       |  24 ++-
 net/mac80211/main.c        |   9 +-
 net/mac80211/rx.c          |   2 +-
 net/mac80211/sta_info.c    |  10 +-
 net/mac80211/sta_info.h    |  27 ++++
 net/mac80211/tx.c          | 377 ++++++++++++++++++++++++++++++++++++++++-----
 net/mac80211/util.c        |  20 ++-
 12 files changed, 817 insertions(+), 79 deletions(-)
 create mode 100644 net/mac80211/codel.h
 create mode 100644 net/mac80211/codel_i.h

Comments

Johannes Berg April 5, 2016, 1:57 p.m. UTC | #1
On Thu, 2016-03-31 at 12:28 +0200, Michal Kazior wrote:

> +++ b/net/mac80211/codel.h
> +++ b/net/mac80211/codel_i.h

Do we really need all this code in .h files? It seems very odd to me to
have all the algorithm implementation there rather than a C file, you
should (can?) only include codel.h into a single C file anyway.

>  struct txq_info {
> -	struct sk_buff_head queue;
> +	struct txq_flow flow;
> +	struct list_head new_flows;
> +	struct list_head old_flows;

This is confusing, can you please document that? Why are there two
lists of flows, *and* an embedded flow? Is the embedded flow on any of
the lists?

> +	u32 backlog_bytes;
> +	u32 backlog_packets;
> +	u32 drop_codel;

Would it make some sense to at least conceptually layer this a bit?
I.e. rather than calling this "drop_codel" call it "drop_congestion" or
something like that?

> @@ -977,12 +978,9 @@ static void ieee80211_do_stop(struct
> ieee80211_sub_if_data *sdata,
>  	if (sdata->vif.txq) {
>  		struct txq_info *txqi = to_txq_info(sdata->vif.txq);
>  
> -		spin_lock_bh(&txqi->queue.lock);
> -		ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
> -		txqi->byte_cnt = 0;
> -		spin_unlock_bh(&txqi->queue.lock);
> -
> -		atomic_set(&sdata->txqs_len[txqi->txq.ac], 0);
> +		spin_lock_bh(&fq->lock);
> +		ieee80211_purge_txq(local, txqi);
> +		spin_unlock_bh(&fq->lock);

This isn't very nice - you're going from locking a single txqi to
having a global hardware lock.

It's probably fine in this particular case, but I'll need to look for
other places :)

> +/**
> + * struct txq_flow - per traffic flow queue
> + *
> + * This structure is used to distinguish and queue different traffic flows
> + * separately for fair queueing/AQM purposes.
> + *
> + * @txqi: txq_info structure it is associated at given time

Do we actually have to keep that? It's on a list per txqi, no?

> + * @flowchain: can be linked to other flows for RR purposes

RR?

> +void ieee80211_teardown_flows(struct ieee80211_local *local)
> +{
> +	struct ieee80211_fq *fq = &local->fq;
> +	struct ieee80211_sub_if_data *sdata;
> +	struct sta_info *sta;
> +	int i;
> +
> +	if (!local->ops->wake_tx_queue)
> +		return;
> +
> +	list_for_each_entry_rcu(sta, &local->sta_list, list)
> +		for (i = 0; i < IEEE80211_NUM_TIDS; i++)
> +			ieee80211_purge_txq(local,
> +					    to_txq_info(sta->sta.txq[i]));
> +
> +	list_for_each_entry_rcu(sdata, &local->interfaces, list)
> +		ieee80211_purge_txq(local, to_txq_info(sdata->vif.txq));

Using RCU iteration here seems rather strange, since it's a teardown
flow? That doesn't seem necessary, since it's control path and must be
holding appropriate locks anyway to make sure nothing is added to the
lists.

> +	skb = codel_dequeue(flow,
> +			    &flow->backlog,
> +			    0,
> +			    &flow->cvars,
> +			    &fq->cparams,
> +			    codel_get_time(),
> +			    false);

What happened here? :)

> +	if (!skb) {
> +		if ((head == &txqi->new_flows) &&
> +		    !list_empty(&txqi->old_flows)) {
> +			list_move_tail(&flow->flowchain, &txqi->old_flows);
> +		} else {
> +			list_del_init(&flow->flowchain);
> +			flow->txqi = NULL;
> +		}
> +		goto begin;
> +	}

Ouch. Any way you can make that easier to follow?

johannes
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Taht April 5, 2016, 2:32 p.m. UTC | #2
thx for the review!

On Tue, Apr 5, 2016 at 6:57 AM, Johannes Berg <johannes@sipsolutions.net> wrote:
> On Thu, 2016-03-31 at 12:28 +0200, Michal Kazior wrote:
>
>> +++ b/net/mac80211/codel.h
>> +++ b/net/mac80211/codel_i.h
>
> Do we really need all this code in .h files? It seems very odd to me to
> have all the algorithm implementation there rather than a C file, you
> should (can?) only include codel.h into a single C file anyway.

The hope had been the original codel.h would have been reusable, which
is not the case at present.

>
>>  struct txq_info {
>> -     struct sk_buff_head queue;
>> +     struct txq_flow flow;
>> +     struct list_head new_flows;
>> +     struct list_head old_flows;
>
> This is confusing, can you please document that? Why are there two
> lists of flows, *and* an embedded flow? Is the embedded flow on any of
> the lists?

To explain the new and old flow concepts, there's
https://tools.ietf.org/html/draft-ietf-aqm-fq-codel-06 which is in the
ietf editors queue for final publication and doesn't have a final name
yet.

The embedded flow concept is michal's and I'm not convinced it's the
right idea as yet.

>
>> +     u32 backlog_bytes;
>> +     u32 backlog_packets;
>> +     u32 drop_codel;
>
> Would it make some sense to at least conceptually layer this a bit?
> I.e. rather than calling this "drop_codel" call it "drop_congestion" or
> something like that?

Is there a more generic place overall in ieee80211 to record per-sta
backlogs, drops and marks?

>> +     skb = codel_dequeue(flow,
>> +                         &flow->backlog,
>> +                         0,
>> +                         &flow->cvars,
>> +                         &fq->cparams,
>> +                         codel_get_time(),
>> +                         false);
>
> What happened here? :)

Magic.

>
>> +     if (!skb) {
>> +             if ((head == &txqi->new_flows) &&
>> +                 !list_empty(&txqi->old_flows)) {
>> +                     list_move_tail(&flow->flowchain, &txqi->old_flows);
>> +             } else {
>> +                     list_del_init(&flow->flowchain);
>> +                     flow->txqi = NULL;
>> +             }
>> +             goto begin;
>> +     }
>
> Ouch. Any way you can make that easier to follow?

It made my brain hurt in the original code, too, but it is eric
optimizing out cycles at his finest.

if the the new_flows list is expired or done, switch to the old_flows
list, if the old_flows list is done, go try selecting another queue to
pull from (which may or may not exist). see the pending rfc for a more
elongated version.

>
> johannes
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michal Kazior April 6, 2016, 5:35 a.m. UTC | #3
On 5 April 2016 at 15:57, Johannes Berg <johannes@sipsolutions.net> wrote:
> On Thu, 2016-03-31 at 12:28 +0200, Michal Kazior wrote:
>
>> +++ b/net/mac80211/codel.h
>> +++ b/net/mac80211/codel_i.h
>
> Do we really need all this code in .h files? It seems very odd to me to
> have all the algorithm implementation there rather than a C file, you
> should (can?) only include codel.h into a single C file anyway.

I just wanted to follow the suggested/implied usage of codel code and
keep modifications to a minimum. I could very well just assimilate it
if you wish.


>>  struct txq_info {
>> -     struct sk_buff_head queue;
>> +     struct txq_flow flow;
>> +     struct list_head new_flows;
>> +     struct list_head old_flows;
>
> This is confusing, can you please document that? Why are there two
> lists of flows, *and* an embedded flow? Is the embedded flow on any of
> the lists?

The new/old flows is follows the same principle as net/sched/sch_fq_codel.c

The embedded flow is for possible collisions, explained below.

Nevertheless I'll add more comments on what-is-what-and-why.


>> +     u32 backlog_bytes;
>> +     u32 backlog_packets;
>> +     u32 drop_codel;
>
> Would it make some sense to at least conceptually layer this a bit?
> I.e. rather than calling this "drop_codel" call it "drop_congestion" or
> something like that?

Sure, I'll change it.


>> +/**
>> + * struct txq_flow - per traffic flow queue
>> + *
>> + * This structure is used to distinguish and queue different traffic flows
>> + * separately for fair queueing/AQM purposes.
>> + *
>> + * @txqi: txq_info structure it is associated at given time
>
> Do we actually have to keep that? It's on a list per txqi, no?

It's used to track ownership.

Packets can be destined to different stations/txqs. At enqueue time I
do a partial hash of a packet to get an "index" which I then use to
address a txq_flow from per-radio list (out of 4096 of them). You can
end up with a situtation like this:
 - packet A hashing to X destined to txq P which is VI
 - packet B hashing to X destined to txq Q which is BK

You can't use the same txq_flow for both A and B because you want to
maintain packets per txqs more than you want to maintain them per flow
(you don't want to queue BK traffic onto VI or vice versa as an
artifact, do you? ;). When a txq_flow doesn't have a txqi it is bound.
Later, if a collision happens (i.e. resulting txq_flow has non-NULL
txqi) the "embedded" per-txq flow is used:

  struct txq_info {
 -     struct sk_buff_head queue;
 +     struct txq_flow flow; // <--- this

When txq_flow becomes empty its txqi is reset.

The embedded flow is otherwise treated like any other flow, i.e. it
can be linked to old_flows and new_flows.


>> + * @flowchain: can be linked to other flows for RR purposes
>
> RR?

Round-robin. Assuming it's correct to call fq_codel an RR scheme?



>> +void ieee80211_teardown_flows(struct ieee80211_local *local)
>> +{
>> +     struct ieee80211_fq *fq = &local->fq;
>> +     struct ieee80211_sub_if_data *sdata;
>> +     struct sta_info *sta;
>> +     int i;
>> +
>> +     if (!local->ops->wake_tx_queue)
>> +             return;
>> +
>> +     list_for_each_entry_rcu(sta, &local->sta_list, list)
>> +             for (i = 0; i < IEEE80211_NUM_TIDS; i++)
>> +                     ieee80211_purge_txq(local,
>> +                                         to_txq_info(sta->sta.txq[i]));
>> +
>> +     list_for_each_entry_rcu(sdata, &local->interfaces, list)
>> +             ieee80211_purge_txq(local, to_txq_info(sdata->vif.txq));
>
> Using RCU iteration here seems rather strange, since it's a teardown
> flow? That doesn't seem necessary, since it's control path and must be
> holding appropriate locks anyway to make sure nothing is added to the
> lists.

You're probably right. I'll look into changing it.


>
>> +     skb = codel_dequeue(flow,
>> +                         &flow->backlog,
>> +                         0,
>> +                         &flow->cvars,
>> +                         &fq->cparams,
>> +                         codel_get_time(),
>> +                         false);
>
> What happened here? :)

I'm not a huge fan of wrapping functions with a lot of (ugly-looking)
arguments. I can make it a different ugly if you want :)


>> +     if (!skb) {
>> +             if ((head == &txqi->new_flows) &&
>> +                 !list_empty(&txqi->old_flows)) {
>> +                     list_move_tail(&flow->flowchain, &txqi->old_flows);
>> +             } else {
>> +                     list_del_init(&flow->flowchain);
>> +                     flow->txqi = NULL;
>> +             }
>> +             goto begin;
>> +     }
>
> Ouch. Any way you can make that easier to follow?

This follows net/sched/sch_fq_codel.h. I can put up a comment to
explain what it's supposed to do?


Micha?
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jonathan Morton April 6, 2016, 6:03 a.m. UTC | #4
> On 6 Apr, 2016, at 08:35, Michal Kazior <michal.kazior@tieto.com> wrote:
> 
> Packets can be destined to different stations/txqs. At enqueue time I
> do a partial hash of a packet to get an "index" which I then use to
> address a txq_flow from per-radio list (out of 4096 of them). You can
> end up with a situtation like this:
> - packet A hashing to X destined to txq P which is VI
> - packet B hashing to X destined to txq Q which is BK
> 
> You can't use the same txq_flow for both A and B because you want to
> maintain packets per txqs more than you want to maintain them per flow
> (you don't want to queue BK traffic onto VI or vice versa as an
> artifact, do you? ;). When a txq_flow doesn't have a txqi it is bound.
> Later, if a collision happens (i.e. resulting txq_flow has non-NULL
> txqi) the "embedded" per-txq flow is used:
> 
>  struct txq_info {
> -     struct sk_buff_head queue;
> +     struct txq_flow flow; // <--- this
> 
> When txq_flow becomes empty its txqi is reset.
> 
> The embedded flow is otherwise treated like any other flow, i.e. it
> can be linked to old_flows and new_flows.

This smells like a very fragile and complex solution to the collision problem.  You may want to look at how Cake solves it.

I use a separate pool of flows per traffic class (essentially, VO/VI/BE/BK), and there is also a set-associative hash to take care of the birthday problem.  The latter has an order-of-magnitude effect on the general flow collision rate once you get into the tens of flows, for very little CPU cost.

 - Jonathan Morton

--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michal Kazior April 6, 2016, 7:16 a.m. UTC | #5
On 6 April 2016 at 08:03, Jonathan Morton <chromatix99@gmail.com> wrote:
>
>> On 6 Apr, 2016, at 08:35, Michal Kazior <michal.kazior@tieto.com> wrote:
>>
>> Packets can be destined to different stations/txqs. At enqueue time I
>> do a partial hash of a packet to get an "index" which I then use to
>> address a txq_flow from per-radio list (out of 4096 of them). You can
>> end up with a situtation like this:
>> - packet A hashing to X destined to txq P which is VI
>> - packet B hashing to X destined to txq Q which is BK
>>
>> You can't use the same txq_flow for both A and B because you want to
>> maintain packets per txqs more than you want to maintain them per flow
>> (you don't want to queue BK traffic onto VI or vice versa as an
>> artifact, do you? ;). When a txq_flow doesn't have a txqi it is bound.
>> Later, if a collision happens (i.e. resulting txq_flow has non-NULL
>> txqi) the "embedded" per-txq flow is used:
>>
>>  struct txq_info {
>> -     struct sk_buff_head queue;
>> +     struct txq_flow flow; // <--- this
>>
>> When txq_flow becomes empty its txqi is reset.
>>
>> The embedded flow is otherwise treated like any other flow, i.e. it
>> can be linked to old_flows and new_flows.
>
> This smells like a very fragile and complex solution to the collision problem.  You may want to look at how Cake solves it.
>
> I use a separate pool of flows per traffic class (essentially, VO/VI/BE/BK), and there is also a set-associative hash to take care of the birthday problem.  The latter has an order-of-magnitude effect on the general flow collision rate once you get into the tens of flows, for very little CPU cost.

When a driver asks mac80211 to dequeue given txq it implies a
destination station as well. This is important because 802.11
aggregation can be performed only on groups of packets going to a
single station on a single tid.

Cake - as I understand it - doesn't really *guarantee* maintaining
this. Keep in mind you can run with hundreds of stations connected.

You don't really want to burden drivers with sorting this grouping up
themselves (and hence coerce them into introducing another level of
intermediate queues, bis).

Without the per-txq fallback flow (regardless of using fq_codel-like
scheme or cake-like scheme in mac80211) you'll need to modify
codel_dequeue() itself to compensate and re-queue/skip frames not
belonging to requested txq.

I'm not sure it's worth it for initial fair-queuing implementation.


Micha?
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Johannes Berg April 6, 2016, 7:19 a.m. UTC | #6
On Wed, 2016-04-06 at 07:35 +0200, Michal Kazior wrote:

> I just wanted to follow the suggested/implied usage of codel code and
> keep modifications to a minimum. I could very well just assimilate it
> if you wish.

I don't really feel all that strongly about it, but I also don't see
the point. It makes it harder to look for the code though, and that
seems fairly pointless.

Btw, just realized that there's also __u32 in there which you should
probably remove and use just u32. Also don't #include <linux/version.h>


> This follows net/sched/sch_fq_codel.h. I can put up a comment to
> explain what it's supposed to do?
> 

Ok, fair enough.

johannes
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Johannes Berg April 6, 2016, 7:21 a.m. UTC | #7
[removing other lists since they spam me with moderation bounces]

> The hope had been the original codel.h would have been reusable,
> which is not the case at present.

So what's the strategy for making it happen? Unless there is one, I
don't see the point in making the code more complicated than it already
has to be anyway.

johannes
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jonathan Morton April 6, 2016, 4:46 p.m. UTC | #8
> On 6 Apr, 2016, at 10:16, Michal Kazior <michal.kazior@tieto.com> wrote:
> 
> When a driver asks mac80211 to dequeue given txq it implies a
> destination station as well. This is important because 802.11
> aggregation can be performed only on groups of packets going to a
> single station on a single tid.
> 
> Cake - as I understand it - doesn't really *guarantee* maintaining
> this. Keep in mind you can run with hundreds of stations connected.
> 
> You don't really want to burden drivers with sorting this grouping up
> themselves (and hence coerce them into introducing another level of
> intermediate queues, bis).

Well, no.  Cake isn’t designed to maintain per-station queues explicitly, though it does have support for stochastic fairness between hosts.  It is also blissfuly unaware of the requirements of wifi aggregation, largely because the standard qdisc interface is likewise ignorant.  I’m therefore not suggesting that you use Cake as-is.

What I’m pointing at instead is the set-associative hash, which could easily be tweaked to put greater emphasis on avoiding putting multiple stations’ traffic in one queue, while maintaining the performance benefits of a fixed queue pool indexed by a hash table, and an extended operating region in which flow isolation is maintained.  You can then have a linked-list of queues assigned to a particular station, so that when a packet for a particular station is requested, you can easily locate one.

I hadn’t appreciated, though, that the TXQ struct was station-specific.  This wasn’t obvious from the code fragments posted, so it looked like packets that incurred hash collisions would be dumped into a single overflow queue.

 - Jonathan Morton

--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Taht April 6, 2016, 5:39 p.m. UTC | #9
On Wed, Apr 6, 2016 at 12:21 AM, Johannes Berg
<johannes@sipsolutions.net> wrote:
> [removing other lists since they spam me with moderation bounces]

I have added your email address be accepted to the codel,
make-wifi-fast lists. My apologies for the bounces.

The people on those lists generally do not have the time to tackle the
volume of traffic on linux-wireless.

>> The hope had been the original codel.h would have been reusable,
>> which is not the case at present.
>
> So what's the strategy for making it happen?

Strategy? to meander towards a result that gives low latency to all
stations, no matter their bandwidth, on several chipsets.

The holy grail from my viewpoint is to get airtime fairness, better
mac utilization, slow stations not starving fast ones, more stations
servicable, and so on, and my focus has generally been on having an
architecture that applied equally to APs and clients. Getting clients
alone to have a queuing latency reduction of these orders of magnitude
on uploads at low rates would be a huge win, but not the holy grail.

It was really nice to have michal's proof of concept(s) show up and
show fq_codel-like benefits at both low and high speeds on wifi, but
it is clear more architectural rework is required to fit the theory
into the reality.

> Unless there is one, I
> don't see the point in making the code more complicated than it already
> has to be anyway.

+1.

Next steps were - get toke's and my testbeds up - avery/tim/myself to
keep hammering at the ath9k - michal exploring dql - jonathon poking
at it with cake-like ideas - and anyone else that cares to join in on
finally fixing bufferbloat on wifi.

and maybe put together a videoconference in 2-3 weeks or so with where
we are stuck at (felix will be off vacation, too, I think). There are
still multiple points where we all talk past each other.

Me, for example, am overly fixated on having a per station queue to
start with (which in the case of a client is two stations - one
multicast/mgtmt frames and regular traffic) and not dealing with tids
until much later in the process. Unfortunately it seems the hook is
very late in the process.
>
> johannes
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Johannes Berg April 7, 2016, 8:53 a.m. UTC | #10
On Wed, 2016-04-06 at 10:39 -0700, Dave Taht wrote:

> > > The hope had been the original codel.h would have been reusable,
> > > which is not the case at present.
> > So what's the strategy for making it happen?
> Strategy? to meander towards a result that gives low latency to all
> stations, no matter their bandwidth, on several chipsets.

I meant "strategy for making the code reusable". Or something like that
anyway. I don't see the point in trying and then failing. Here we're
adding a completely different version of codel to the kernel - why?
What makes this version unusable for the original usage in
include/net/codel.h? Can't we replace that one with the newer version
and actually use the same file here?

Or - why bother with the header file to make it shareable, if we're not
even attempting to do that?

johannes
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a53333cb1528..0ee51dbb361b 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -888,8 +888,18 @@  struct ieee80211_tx_info {
 				/* only needed before rate control */
 				unsigned long jiffies;
 			};
-			/* NB: vif can be NULL for injected frames */
-			struct ieee80211_vif *vif;
+			union {
+				/* NB: vif can be NULL for injected frames */
+				struct ieee80211_vif *vif;
+
+				/* When packets are enqueued on txq it's easy
+				 * to re-construct the vif pointer. There's no
+				 * more space in tx_info so it can be used to
+				 * store the necessary enqueue time for packet
+				 * sojourn time computation.
+				 */
+				u64 enqueue_time;
+			};
 			struct ieee80211_key_conf *hw_key;
 			u32 flags;
 			/* 4 bytes free */
@@ -2113,9 +2123,6 @@  enum ieee80211_hw_flags {
  * @n_cipher_schemes: a size of an array of cipher schemes definitions.
  * @cipher_schemes: a pointer to an array of cipher scheme definitions
  *	supported by HW.
- *
- * @txq_ac_max_pending: maximum number of frames per AC pending in all txq
- *	entries for a vif.
  */
 struct ieee80211_hw {
 	struct ieee80211_conf conf;
@@ -2145,7 +2152,6 @@  struct ieee80211_hw {
 	u8 uapsd_max_sp_len;
 	u8 n_cipher_schemes;
 	const struct ieee80211_cipher_scheme *cipher_schemes;
-	int txq_ac_max_pending;
 };
 
 static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw,
@@ -5633,6 +5639,9 @@  struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
  * txq state can change half-way of this function and the caller may end up
  * with "new" frame_cnt and "old" byte_cnt or vice-versa.
  *
+ * Moreover returned values are best-case, i.e. assuming queueing algorithm
+ * will not drop frames due to excess latency.
+ *
  * @txq: pointer obtained from station or virtual interface
  * @frame_cnt: pointer to store frame count
  * @byte_cnt: pointer to store byte count
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 4932e9f243a2..b9d0cee2a786 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -194,17 +194,21 @@  static void
 ieee80211_agg_stop_txq(struct sta_info *sta, int tid)
 {
 	struct ieee80211_txq *txq = sta->sta.txq[tid];
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_fq *fq;
 	struct txq_info *txqi;
 
 	if (!txq)
 		return;
 
 	txqi = to_txq_info(txq);
+	sdata = vif_to_sdata(txq->vif);
+	fq = &sdata->local->fq;
 
 	/* Lock here to protect against further seqno updates on dequeue */
-	spin_lock_bh(&txqi->queue.lock);
+	spin_lock_bh(&fq->lock);
 	set_bit(IEEE80211_TXQ_STOP, &txqi->flags);
-	spin_unlock_bh(&txqi->queue.lock);
+	spin_unlock_bh(&fq->lock);
 }
 
 static void
diff --git a/net/mac80211/codel.h b/net/mac80211/codel.h
new file mode 100644
index 000000000000..e6470dbe5b0b
--- /dev/null
+++ b/net/mac80211/codel.h
@@ -0,0 +1,264 @@ 
+#ifndef __NET_MAC80211_CODEL_H
+#define __NET_MAC80211_CODEL_H
+
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm
+ *
+ *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
+ *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
+ *  Copyright (C) 2016 Michael D. Taht <dave.taht@bufferbloat.net>
+ *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *  Copyright (C) 2015 Jonathan Morton <chromatix99@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/ktime.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <linux/reciprocal_div.h>
+
+#include "codel_i.h"
+
+/* Controlling Queue Delay (CoDel) algorithm
+ * =========================================
+ * Source : Kathleen Nichols and Van Jacobson
+ * http://queue.acm.org/detail.cfm?id=2209336
+ *
+ * Implemented on linux by Dave Taht and Eric Dumazet
+ */
+
+/* CoDel5 uses a real clock, unlike codel */
+
+static inline u64 codel_get_time(void)
+{
+	return ktime_get_ns();
+}
+
+static inline u32 codel_time_to_us(u64 val)
+{
+	do_div(val, NSEC_PER_USEC);
+	return (u32)val;
+}
+
+/* sizeof_in_bits(rec_inv_sqrt) */
+#define REC_INV_SQRT_BITS (8 * sizeof(u16))
+/* needed shift to get a Q0.32 number from rec_inv_sqrt */
+#define REC_INV_SQRT_SHIFT (32 - REC_INV_SQRT_BITS)
+
+/* Newton approximation method needs more iterations at small inputs,
+ * so cache them.
+ */
+
+static void codel_vars_init(struct codel_vars *vars)
+{
+	memset(vars, 0, sizeof(*vars));
+}
+
+/*
+ * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Iterative_methods_for_reciprocal_square_roots
+ * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ *
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
+ */
+static inline void codel_Newton_step(struct codel_vars *vars)
+{
+	u32 invsqrt = ((u32)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT;
+	u32 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
+	u64 val = (3LL << 32) - ((u64)vars->count * invsqrt2);
+
+	val >>= 2; /* avoid overflow in following multiply */
+	val = (val * invsqrt) >> (32 - 2 + 1);
+
+	vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT;
+}
+
+/*
+ * CoDel control_law is t + interval/sqrt(count)
+ * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
+ * both sqrt() and divide operation.
+ */
+static u64 codel_control_law(u64 t,
+			     u64 interval,
+			     u32 rec_inv_sqrt)
+{
+	return t + reciprocal_scale(interval, rec_inv_sqrt <<
+				    REC_INV_SQRT_SHIFT);
+}
+
+/* Forward declaration of this for use elsewhere */
+
+static inline u64
+custom_codel_get_enqueue_time(struct sk_buff *skb);
+
+static inline struct sk_buff *
+custom_dequeue(struct codel_vars *vars, void *ptr);
+
+static inline void
+custom_drop(struct sk_buff *skb, void *ptr);
+
+static bool codel_should_drop(struct sk_buff *skb,
+			      __u32 *backlog,
+			      __u32 backlog_thr,
+			      struct codel_vars *vars,
+			      const struct codel_params *p,
+			      u64 now)
+{
+	if (!skb) {
+		vars->first_above_time = 0;
+		return false;
+	}
+
+	if (now - custom_codel_get_enqueue_time(skb) < p->target ||
+	    *backlog <= backlog_thr) {
+		/* went below - stay below for at least interval */
+		vars->first_above_time = 0;
+		return false;
+	}
+
+	if (vars->first_above_time == 0) {
+		/* just went above from below; mark the time */
+		vars->first_above_time = now + p->interval;
+
+	} else if (now > vars->first_above_time) {
+		return true;
+	}
+
+	return false;
+}
+
+static struct sk_buff *codel_dequeue(void *ptr,
+				     __u32 *backlog,
+				     __u32 backlog_thr,
+				     struct codel_vars *vars,
+				     struct codel_params *p,
+				     u64 now,
+				     bool overloaded)
+{
+	struct sk_buff *skb = custom_dequeue(vars, ptr);
+	bool drop;
+
+	if (!skb) {
+		vars->dropping = false;
+		return skb;
+	}
+	drop = codel_should_drop(skb, backlog, backlog_thr, vars, p, now);
+	if (vars->dropping) {
+		if (!drop) {
+			/* sojourn time below target - leave dropping state */
+			vars->dropping = false;
+		} else if (now >= vars->drop_next) {
+			/* It's time for the next drop. Drop the current
+			 * packet and dequeue the next. The dequeue might
+			 * take us out of dropping state.
+			 * If not, schedule the next drop.
+			 * A large backlog might result in drop rates so high
+			 * that the next drop should happen now,
+			 * hence the while loop.
+			 */
+
+			/* saturating increment */
+			vars->count++;
+			if (!vars->count)
+				vars->count--;
+
+			codel_Newton_step(vars);
+			vars->drop_next = codel_control_law(vars->drop_next,
+							    p->interval,
+							    vars->rec_inv_sqrt);
+			do {
+				if (INET_ECN_set_ce(skb) && !overloaded) {
+					vars->ecn_mark++;
+					/* and schedule the next drop */
+					vars->drop_next = codel_control_law(
+						vars->drop_next, p->interval,
+						vars->rec_inv_sqrt);
+					goto end;
+				}
+				custom_drop(skb, ptr);
+				vars->drop_count++;
+				skb = custom_dequeue(vars, ptr);
+				if (skb && !codel_should_drop(skb, backlog,
+							      backlog_thr,
+							      vars, p, now)) {
+					/* leave dropping state */
+					vars->dropping = false;
+				} else {
+					/* schedule the next drop */
+					vars->drop_next = codel_control_law(
+						vars->drop_next, p->interval,
+						vars->rec_inv_sqrt);
+				}
+			} while (skb && vars->dropping && now >=
+				 vars->drop_next);
+
+			/* Mark the packet regardless */
+			if (skb && INET_ECN_set_ce(skb))
+				vars->ecn_mark++;
+		}
+	} else if (drop) {
+		if (INET_ECN_set_ce(skb) && !overloaded) {
+			vars->ecn_mark++;
+		} else {
+			custom_drop(skb, ptr);
+			vars->drop_count++;
+
+			skb = custom_dequeue(vars, ptr);
+			drop = codel_should_drop(skb, backlog, backlog_thr,
+						 vars, p, now);
+			if (skb && INET_ECN_set_ce(skb))
+				vars->ecn_mark++;
+		}
+		vars->dropping = true;
+		/* if min went above target close to when we last went below
+		 * assume that the drop rate that controlled the queue on the
+		 * last cycle is a good starting point to control it now.
+		 */
+		if (vars->count > 2 &&
+		    now - vars->drop_next < 8 * p->interval) {
+			vars->count -= 2;
+			codel_Newton_step(vars);
+		} else {
+			vars->count = 1;
+			vars->rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT;
+		}
+		codel_Newton_step(vars);
+		vars->drop_next = codel_control_law(now, p->interval,
+						    vars->rec_inv_sqrt);
+	}
+end:
+	return skb;
+}
+#endif
diff --git a/net/mac80211/codel_i.h b/net/mac80211/codel_i.h
new file mode 100644
index 000000000000..60371121e526
--- /dev/null
+++ b/net/mac80211/codel_i.h
@@ -0,0 +1,89 @@ 
+#ifndef __NET_MAC80211_CODEL_I_H
+#define __NET_MAC80211_CODEL_I_H
+
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm
+ *
+ *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
+ *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
+ *  Copyright (C) 2016 Michael D. Taht <dave.taht@bufferbloat.net>
+ *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *  Copyright (C) 2015 Jonathan Morton <chromatix99@gmail.com>
+ *  Copyright (C) 2016 Michal Kazior <michal.kazior@tieto.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/ktime.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <linux/reciprocal_div.h>
+
+/* Controlling Queue Delay (CoDel) algorithm
+ * =========================================
+ * Source : Kathleen Nichols and Van Jacobson
+ * http://queue.acm.org/detail.cfm?id=2209336
+ *
+ * Implemented on linux by Dave Taht and Eric Dumazet
+ */
+
+/* CoDel5 uses a real clock, unlike codel */
+
+#define MS2TIME(a) (a * (u64)NSEC_PER_MSEC)
+#define US2TIME(a) (a * (u64)NSEC_PER_USEC)
+
+/**
+ * struct codel_vars - contains codel variables
+ * @count:		how many drops we've done since the last time we
+ *			entered dropping state
+ * @dropping:		set to > 0 if in dropping state
+ * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
+ * @first_above_time:	when we went (or will go) continuously above target
+ *			for interval
+ * @drop_next:		time to drop next packet, or when we dropped last
+ * @drop_count:	temp count of dropped packets in dequeue()
+ * @ecn_mark:	number of packets we ECN marked instead of dropping
+ */
+
+struct codel_vars {
+	u32	count;
+	u16	dropping;
+	u16	rec_inv_sqrt;
+	u64	first_above_time;
+	u64	drop_next;
+	u16	drop_count;
+	u16	ecn_mark;
+};
+#endif
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c6830fbe7d68..3dc5192b6dd9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -805,9 +805,18 @@  enum txq_info_flags {
 };
 
 struct txq_info {
-	struct sk_buff_head queue;
+	struct txq_flow flow;
+	struct list_head new_flows;
+	struct list_head old_flows;
+	u32 backlog_bytes;
+	u32 backlog_packets;
+	u32 drop_codel;
+	u32 drop_overlimit;
+	u32 collisions;
+	u32 flows;
+	u32 tx_bytes;
+	u32 tx_packets;
 	unsigned long flags;
-	unsigned long byte_cnt;
 
 	/* keep last! */
 	struct ieee80211_txq txq;
@@ -855,7 +864,6 @@  struct ieee80211_sub_if_data {
 	bool control_port_no_encrypt;
 	int encrypt_headroom;
 
-	atomic_t txqs_len[IEEE80211_NUM_ACS];
 	struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
 	struct mac80211_qos_map __rcu *qos_map;
 
@@ -1092,11 +1100,37 @@  enum mac80211_scan_state {
 	SCAN_ABORT,
 };
 
+/**
+ * struct codel_params - stores codel parameters
+ *
+ * @interval: initial drop rate
+ * @target: maximum persistent sojourn time
+ */
+struct codel_params {
+	u64 interval;
+	u64 target;
+};
+
+struct ieee80211_fq {
+	struct txq_flow *flows;
+	struct list_head backlogs;
+	struct codel_params cparams;
+	spinlock_t lock;
+	u32 flows_cnt;
+	u32 perturbation;
+	u32 txq_limit;
+	u32 quantum;
+	u32 backlog;
+	u32 drop_overlimit;
+	u32 drop_codel;
+};
+
 struct ieee80211_local {
 	/* embed the driver visible part.
 	 * don't cast (use the static inlines below), but we keep
 	 * it first anyway so they become a no-op */
 	struct ieee80211_hw hw;
+	struct ieee80211_fq fq;
 
 	const struct ieee80211_ops *ops;
 
@@ -1928,6 +1962,11 @@  static inline bool ieee80211_can_run_worker(struct ieee80211_local *local)
 void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
 			     struct sta_info *sta,
 			     struct txq_info *txq, int tid);
+void ieee80211_purge_txq(struct ieee80211_local *local, struct txq_info *txqi);
+void ieee80211_init_flow(struct txq_flow *flow);
+int ieee80211_setup_flows(struct ieee80211_local *local);
+void ieee80211_teardown_flows(struct ieee80211_local *local);
+
 void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 			 u16 transaction, u16 auth_alg, u16 status,
 			 const u8 *extra, size_t extra_len, const u8 *bssid,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 453b4e741780..1faea208edfc 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -779,6 +779,7 @@  static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 			      bool going_down)
 {
 	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_fq *fq = &local->fq;
 	unsigned long flags;
 	struct sk_buff *skb, *tmp;
 	u32 hw_reconf_flags = 0;
@@ -977,12 +978,9 @@  static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 	if (sdata->vif.txq) {
 		struct txq_info *txqi = to_txq_info(sdata->vif.txq);
 
-		spin_lock_bh(&txqi->queue.lock);
-		ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
-		txqi->byte_cnt = 0;
-		spin_unlock_bh(&txqi->queue.lock);
-
-		atomic_set(&sdata->txqs_len[txqi->txq.ac], 0);
+		spin_lock_bh(&fq->lock);
+		ieee80211_purge_txq(local, txqi);
+		spin_unlock_bh(&fq->lock);
 	}
 
 	if (local->open_count == 0)
@@ -1198,6 +1196,12 @@  static void ieee80211_if_setup(struct net_device *dev)
 	dev->destructor = ieee80211_if_free;
 }
 
+static void ieee80211_if_setup_no_queue(struct net_device *dev)
+{
+	ieee80211_if_setup(dev);
+	dev->priv_flags |= IFF_NO_QUEUE;
+}
+
 static void ieee80211_iface_work(struct work_struct *work)
 {
 	struct ieee80211_sub_if_data *sdata =
@@ -1707,6 +1711,7 @@  int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 	struct net_device *ndev = NULL;
 	struct ieee80211_sub_if_data *sdata = NULL;
 	struct txq_info *txqi;
+	void (*if_setup)(struct net_device *dev);
 	int ret, i;
 	int txqs = 1;
 
@@ -1734,12 +1739,17 @@  int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 			txq_size += sizeof(struct txq_info) +
 				    local->hw.txq_data_size;
 
+		if (local->ops->wake_tx_queue)
+			if_setup = ieee80211_if_setup_no_queue;
+		else
+			if_setup = ieee80211_if_setup;
+
 		if (local->hw.queues >= IEEE80211_NUM_ACS)
 			txqs = IEEE80211_NUM_ACS;
 
 		ndev = alloc_netdev_mqs(size + txq_size,
 					name, name_assign_type,
-					ieee80211_if_setup, txqs, 1);
+					if_setup, txqs, 1);
 		if (!ndev)
 			return -ENOMEM;
 		dev_net_set(ndev, wiphy_net(local->hw.wiphy));
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 8190bf27ebff..9fd3b10ae52b 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1053,9 +1053,6 @@  int ieee80211_register_hw(struct ieee80211_hw *hw)
 
 	local->dynamic_ps_forced_timeout = -1;
 
-	if (!local->hw.txq_ac_max_pending)
-		local->hw.txq_ac_max_pending = 64;
-
 	result = ieee80211_wep_init(local);
 	if (result < 0)
 		wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
@@ -1087,6 +1084,10 @@  int ieee80211_register_hw(struct ieee80211_hw *hw)
 
 	rtnl_unlock();
 
+	result = ieee80211_setup_flows(local);
+	if (result)
+		goto fail_flows;
+
 #ifdef CONFIG_INET
 	local->ifa_notifier.notifier_call = ieee80211_ifa_changed;
 	result = register_inetaddr_notifier(&local->ifa_notifier);
@@ -1112,6 +1113,8 @@  int ieee80211_register_hw(struct ieee80211_hw *hw)
 #if defined(CONFIG_INET) || defined(CONFIG_IPV6)
  fail_ifa:
 #endif
+	ieee80211_teardown_flows(local);
+ fail_flows:
 	rtnl_lock();
 	rate_control_deinitialize(local);
 	ieee80211_remove_interfaces(local);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 91279576f4a7..10a6e9c3de51 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1268,7 +1268,7 @@  static void sta_ps_start(struct sta_info *sta)
 	for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
 		struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
 
-		if (!skb_queue_len(&txqi->queue))
+		if (!txqi->backlog_packets)
 			set_bit(tid, &sta->txq_buffered_tids);
 		else
 			clear_bit(tid, &sta->txq_buffered_tids);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 00c82fb152c0..0729046a0144 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -112,11 +112,7 @@  static void __cleanup_single_sta(struct sta_info *sta)
 	if (sta->sta.txq[0]) {
 		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
 			struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
-			int n = skb_queue_len(&txqi->queue);
-
-			ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
-			atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]);
-			txqi->byte_cnt = 0;
+			ieee80211_purge_txq(local, txqi);
 		}
 	}
 
@@ -1193,7 +1189,7 @@  void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
 		for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
 			struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
 
-			if (!skb_queue_len(&txqi->queue))
+			if (!txqi->backlog_packets)
 				continue;
 
 			drv_wake_tx_queue(local, txqi);
@@ -1630,7 +1626,7 @@  ieee80211_sta_ps_deliver_response(struct sta_info *sta,
 		for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
 			struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
 
-			if (!(tids & BIT(tid)) || skb_queue_len(&txqi->queue))
+			if (!(tids & BIT(tid)) || txqi->backlog_packets)
 				continue;
 
 			sta_info_recalc_tim(sta);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 053f5c4fa495..4f7ad9158d31 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -19,6 +19,7 @@ 
 #include <linux/etherdevice.h>
 #include <linux/rhashtable.h>
 #include "key.h"
+#include "codel_i.h"
 
 /**
  * enum ieee80211_sta_info_flags - Stations flags
@@ -330,6 +331,32 @@  struct mesh_sta {
 
 DECLARE_EWMA(signal, 1024, 8)
 
+struct txq_info;
+
+/**
+ * struct txq_flow - per traffic flow queue
+ *
+ * This structure is used to distinguish and queue different traffic flows
+ * separately for fair queueing/AQM purposes.
+ *
+ * @txqi: txq_info structure it is associated at given time
+ * @flowchain: can be linked to other flows for RR purposes
+ * @backlogchain: can be linked to other flows for backlog sorting purposes
+ * @queue: sk_buff queue
+ * @cvars: codel state vars
+ * @backlog: number of bytes pending in the queue
+ * @deficit: used for fair queueing balancing
+ */
+struct txq_flow {
+	struct txq_info *txqi;
+	struct list_head flowchain;
+	struct list_head backlogchain;
+	struct sk_buff_head queue;
+	struct codel_vars cvars;
+	u32 backlog;
+	int deficit;
+};
+
 /**
  * struct sta_info - STA information
  *
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 485e30a24b38..dd65e34f7107 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -34,6 +34,7 @@ 
 #include "wpa.h"
 #include "wme.h"
 #include "rate.h"
+#include "codel.h"
 
 /* misc utils */
 
@@ -1232,27 +1233,323 @@  ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
 	return TX_CONTINUE;
 }
 
-static void ieee80211_drv_tx(struct ieee80211_local *local,
-			     struct ieee80211_vif *vif,
-			     struct ieee80211_sta *pubsta,
-			     struct sk_buff *skb)
+static inline u64
+custom_codel_get_enqueue_time(struct sk_buff *skb)
+{
+	return IEEE80211_SKB_CB(skb)->control.enqueue_time;
+}
+
+static inline struct sk_buff *
+flow_dequeue(struct ieee80211_local *local, struct txq_flow *flow)
+{
+	struct ieee80211_fq *fq = &local->fq;
+	struct txq_info *txqi = flow->txqi;
+	struct txq_flow *i;
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&flow->queue);
+	if (!skb)
+		return NULL;
+
+	txqi->backlog_bytes -= skb->len;
+	txqi->backlog_packets--;
+	flow->backlog -= skb->len;
+	fq->backlog--;
+
+	if (flow->backlog == 0) {
+		list_del_init(&flow->backlogchain);
+	} else {
+		i = flow;
+
+		list_for_each_entry_continue(i, &fq->backlogs, backlogchain)
+			if (i->backlog < flow->backlog)
+				break;
+
+		list_move_tail(&flow->backlogchain, &i->backlogchain);
+	}
+
+	return skb;
+}
+
+static inline struct sk_buff *
+custom_dequeue(struct codel_vars *vars, void *ptr)
+{
+	struct txq_flow *flow = ptr;
+	struct txq_info *txqi = flow->txqi;
+	struct ieee80211_vif *vif = txqi->txq.vif;
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+
+	return flow_dequeue(local, flow);
+}
+
+static inline void
+custom_drop(struct sk_buff *skb, void *ptr)
+{
+	struct txq_flow *flow = ptr;
+	struct txq_info *txqi = flow->txqi;
+	struct ieee80211_vif *vif = txqi->txq.vif;
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_hw *hw = &local->hw;
+
+	ieee80211_free_txskb(hw, skb);
+
+	txqi->drop_codel++;
+	local->fq.drop_codel++;
+}
+
+static u32 fq_hash(struct ieee80211_fq *fq, struct sk_buff *skb)
+{
+	u32 hash = skb_get_hash_perturb(skb, fq->perturbation);
+
+	return reciprocal_scale(hash, fq->flows_cnt);
+}
+
+static void fq_drop(struct ieee80211_local *local)
+{
+	struct ieee80211_hw *hw = &local->hw;
+	struct ieee80211_fq *fq = &local->fq;
+	struct txq_flow *flow;
+	struct sk_buff *skb;
+
+	flow = list_first_entry_or_null(&fq->backlogs, struct txq_flow,
+					backlogchain);
+	if (WARN_ON_ONCE(!flow))
+		return;
+
+	skb = flow_dequeue(local, flow);
+	if (WARN_ON_ONCE(!skb))
+		return;
+
+	ieee80211_free_txskb(hw, skb);
+
+	flow->txqi->drop_overlimit++;
+	fq->drop_overlimit++;
+}
+
+void ieee80211_init_flow(struct txq_flow *flow)
+{
+	INIT_LIST_HEAD(&flow->flowchain);
+	INIT_LIST_HEAD(&flow->backlogchain);
+	__skb_queue_head_init(&flow->queue);
+	codel_vars_init(&flow->cvars);
+}
+
+int ieee80211_setup_flows(struct ieee80211_local *local)
+{
+	struct ieee80211_fq *fq = &local->fq;
+	int i;
+
+	if (!local->ops->wake_tx_queue)
+		return 0;
+
+	memset(fq, 0, sizeof(fq[0]));
+	INIT_LIST_HEAD(&fq->backlogs);
+	spin_lock_init(&fq->lock);
+	fq->flows_cnt = 4096;
+	fq->perturbation = prandom_u32();
+	fq->quantum = 300;
+	fq->txq_limit = 8192;
+	fq->cparams.target = MS2TIME(20);
+	fq->cparams.interval = MS2TIME(100);
+
+	fq->flows = kcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL);
+	if (!fq->flows)
+		return -ENOMEM;
+
+	for (i = 0; i < fq->flows_cnt; i++)
+		ieee80211_init_flow(&fq->flows[i]);
+
+	return 0;
+}
+
+static void ieee80211_reset_flow(struct ieee80211_local *local,
+				 struct txq_flow *flow)
+{
+	if (!list_empty(&flow->flowchain))
+		list_del_init(&flow->flowchain);
+
+	if (!list_empty(&flow->backlogchain))
+		list_del_init(&flow->backlogchain);
+
+	ieee80211_purge_tx_queue(&local->hw, &flow->queue);
+
+	flow->deficit = 0;
+	flow->txqi = NULL;
+}
+
+void ieee80211_purge_txq(struct ieee80211_local *local, struct txq_info *txqi)
+{
+	struct txq_flow *flow;
+	int i;
+
+	for (i = 0; i < local->fq.flows_cnt; i++) {
+		flow = &local->fq.flows[i];
+
+		if (flow->txqi != txqi)
+			continue;
+
+		ieee80211_reset_flow(local, flow);
+	}
+
+	ieee80211_reset_flow(local, &txqi->flow);
+
+	txqi->backlog_bytes = 0;
+	txqi->backlog_packets = 0;
+}
+
+void ieee80211_teardown_flows(struct ieee80211_local *local)
+{
+	struct ieee80211_fq *fq = &local->fq;
+	struct ieee80211_sub_if_data *sdata;
+	struct sta_info *sta;
+	int i;
+
+	if (!local->ops->wake_tx_queue)
+		return;
+
+	list_for_each_entry_rcu(sta, &local->sta_list, list)
+		for (i = 0; i < IEEE80211_NUM_TIDS; i++)
+			ieee80211_purge_txq(local,
+					    to_txq_info(sta->sta.txq[i]));
+
+	list_for_each_entry_rcu(sdata, &local->interfaces, list)
+		ieee80211_purge_txq(local, to_txq_info(sdata->vif.txq));
+
+	for (i = 0; i < fq->flows_cnt; i++)
+		ieee80211_reset_flow(local, &fq->flows[i]);
+
+	kfree(fq->flows);
+
+	fq->flows = NULL;
+	fq->flows_cnt = 0;
+}
+
+static void ieee80211_txq_enqueue(struct ieee80211_local *local,
+				  struct txq_info *txqi,
+				  struct sk_buff *skb)
+{
+	struct ieee80211_fq *fq = &local->fq;
+	struct txq_flow *flow;
+	struct txq_flow *i;
+	size_t idx = fq_hash(fq, skb);
+
+	lockdep_assert_held(&fq->lock);
+
+	flow = &fq->flows[idx];
+
+	if (flow->txqi && flow->txqi != txqi) {
+		flow = &txqi->flow;
+		txqi->collisions++;
+	}
+
+	if (!flow->txqi)
+		txqi->flows++;
+
+	/* The following overwrites `vif` pointer effectively. It is later
+	 * restored using txq structure.
+	 */
+	IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time();
+
+	flow->txqi = txqi;
+	flow->backlog += skb->len;
+	txqi->backlog_bytes += skb->len;
+	txqi->backlog_packets++;
+	fq->backlog++;
+
+	if (list_empty(&flow->backlogchain))
+		list_add_tail(&flow->backlogchain, &fq->backlogs);
+
+	i = flow;
+	list_for_each_entry_continue_reverse(i, &fq->backlogs, backlogchain)
+		if (i->backlog > flow->backlog)
+			break;
+
+	list_move(&flow->backlogchain, &i->backlogchain);
+
+	if (list_empty(&flow->flowchain)) {
+		flow->deficit = fq->quantum;
+		list_add_tail(&flow->flowchain, &txqi->new_flows);
+	}
+
+	__skb_queue_tail(&flow->queue, skb);
+
+	if (fq->backlog > fq->txq_limit)
+		fq_drop(local);
+}
+
+static struct sk_buff *ieee80211_txq_dequeue(struct ieee80211_local *local,
+					     struct txq_info *txqi)
+{
+	struct ieee80211_fq *fq = &local->fq;
+	struct txq_flow *flow;
+	struct list_head *head;
+	struct sk_buff *skb;
+
+begin:
+	head = &txqi->new_flows;
+	if (list_empty(head)) {
+		head = &txqi->old_flows;
+		if (list_empty(head))
+			return NULL;
+	}
+
+	flow = list_first_entry(head, struct txq_flow, flowchain);
+
+	if (flow->deficit <= 0) {
+		flow->deficit += fq->quantum;
+		list_move_tail(&flow->flowchain, &txqi->old_flows);
+		goto begin;
+	}
+
+	skb = codel_dequeue(flow,
+			    &flow->backlog,
+			    0,
+			    &flow->cvars,
+			    &fq->cparams,
+			    codel_get_time(),
+			    false);
+	if (!skb) {
+		if ((head == &txqi->new_flows) &&
+		    !list_empty(&txqi->old_flows)) {
+			list_move_tail(&flow->flowchain, &txqi->old_flows);
+		} else {
+			list_del_init(&flow->flowchain);
+			flow->txqi = NULL;
+		}
+		goto begin;
+	}
+
+	flow->deficit -= skb->len;
+	txqi->tx_bytes += skb->len;
+	txqi->tx_packets++;
+
+	/* The `vif` pointer was overwritten with enqueue time during
+	 * enqueuing. Restore it before handing to driver.
+	 */
+	IEEE80211_SKB_CB(skb)->control.vif = flow->txqi->txq.vif;
+
+	return skb;
+}
+
+static struct txq_info *
+ieee80211_get_txq(struct ieee80211_local *local,
+		  struct ieee80211_vif *vif,
+		  struct ieee80211_sta *pubsta,
+		  struct sk_buff *skb)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
-	struct ieee80211_tx_control control = {
-		.sta = pubsta,
-	};
 	struct ieee80211_txq *txq = NULL;
-	struct txq_info *txqi;
-	u8 ac;
 
 	if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
+	    (info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK) ||
 	    (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
-		goto tx_normal;
+		return NULL;
 
 	if (!ieee80211_is_data(hdr->frame_control))
-		goto tx_normal;
+		return NULL;
 
 	if (pubsta) {
 		u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
@@ -1263,52 +1560,29 @@  static void ieee80211_drv_tx(struct ieee80211_local *local,
 	}
 
 	if (!txq)
-		goto tx_normal;
+		return NULL;
 
-	ac = txq->ac;
-	txqi = to_txq_info(txq);
-	atomic_inc(&sdata->txqs_len[ac]);
-	if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending)
-		netif_stop_subqueue(sdata->dev, ac);
-
-	spin_lock_bh(&txqi->queue.lock);
-	txqi->byte_cnt += skb->len;
-	__skb_queue_tail(&txqi->queue, skb);
-	spin_unlock_bh(&txqi->queue.lock);
-
-	drv_wake_tx_queue(local, txqi);
-
-	return;
-
-tx_normal:
-	drv_tx(local, &control, skb);
+	return to_txq_info(txq);
 }
 
 struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 				     struct ieee80211_txq *txq)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
-	struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif);
+	struct ieee80211_fq *fq = &local->fq;
 	struct txq_info *txqi = container_of(txq, struct txq_info, txq);
 	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb = NULL;
-	u8 ac = txq->ac;
 
-	spin_lock_bh(&txqi->queue.lock);
+	spin_lock_bh(&fq->lock);
 
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
 
-	skb = __skb_dequeue(&txqi->queue);
+	skb = ieee80211_txq_dequeue(local, txqi);
 	if (!skb)
 		goto out;
 
-	txqi->byte_cnt -= skb->len;
-
-	atomic_dec(&sdata->txqs_len[ac]);
-	if (__netif_subqueue_stopped(sdata->dev, ac))
-		ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]);
-
 	hdr = (struct ieee80211_hdr *)skb->data;
 	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
 		struct sta_info *sta = container_of(txq->sta, struct sta_info,
@@ -1323,7 +1597,7 @@  struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 	}
 
 out:
-	spin_unlock_bh(&txqi->queue.lock);
+	spin_unlock_bh(&fq->lock);
 
 	return skb;
 }
@@ -1335,7 +1609,10 @@  static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct sk_buff_head *skbs,
 			       bool txpending)
 {
+	struct ieee80211_fq *fq = &local->fq;
+	struct ieee80211_tx_control control = {};
 	struct sk_buff *skb, *tmp;
+	struct txq_info *txqi;
 	unsigned long flags;
 
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1350,6 +1627,21 @@  static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		}
 #endif
 
+		txqi = ieee80211_get_txq(local, vif, sta, skb);
+		if (txqi) {
+			info->control.vif = vif;
+
+			__skb_unlink(skb, skbs);
+
+			spin_lock_bh(&fq->lock);
+			ieee80211_txq_enqueue(local, txqi, skb);
+			spin_unlock_bh(&fq->lock);
+
+			drv_wake_tx_queue(local, txqi);
+
+			continue;
+		}
+
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1392,9 +1684,10 @@  static bool ieee80211_tx_frags(struct ieee80211_local *local,
 		spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
 
 		info->control.vif = vif;
+		control.sta = sta;
 
 		__skb_unlink(skb, skbs);
-		ieee80211_drv_tx(local, vif, sta, skb);
+		drv_tx(local, &control, skb);
 	}
 
 	return true;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 0319d6d4f863..cbcdf7cf9679 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -244,6 +244,9 @@  void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
 	struct ieee80211_sub_if_data *sdata;
 	int n_acs = IEEE80211_NUM_ACS;
 
+	if (local->ops->wake_tx_queue)
+		return;
+
 	if (local->hw.queues < IEEE80211_NUM_ACS)
 		n_acs = 1;
 
@@ -260,11 +263,6 @@  void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
 		for (ac = 0; ac < n_acs; ac++) {
 			int ac_queue = sdata->vif.hw_queue[ac];
 
-			if (local->ops->wake_tx_queue &&
-			    (atomic_read(&sdata->txqs_len[ac]) >
-			     local->hw.txq_ac_max_pending))
-				continue;
-
 			if (ac_queue == queue ||
 			    (sdata->vif.cab_queue == queue &&
 			     local->queue_stop_reasons[ac_queue] == 0 &&
@@ -352,6 +350,9 @@  static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
 	if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue]))
 		return;
 
+	if (local->ops->wake_tx_queue)
+		return;
+
 	if (local->hw.queues < IEEE80211_NUM_ACS)
 		n_acs = 1;
 
@@ -3392,8 +3393,11 @@  void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
 			     struct sta_info *sta,
 			     struct txq_info *txqi, int tid)
 {
-	skb_queue_head_init(&txqi->queue);
+	INIT_LIST_HEAD(&txqi->old_flows);
+	INIT_LIST_HEAD(&txqi->new_flows);
+	ieee80211_init_flow(&txqi->flow);
 	txqi->txq.vif = &sdata->vif;
+	txqi->flow.txqi = txqi;
 
 	if (sta) {
 		txqi->txq.sta = &sta->sta;
@@ -3414,9 +3418,9 @@  void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
 	struct txq_info *txqi = to_txq_info(txq);
 
 	if (frame_cnt)
-		*frame_cnt = txqi->queue.qlen;
+		*frame_cnt = txqi->backlog_packets;
 
 	if (byte_cnt)
-		*byte_cnt = txqi->byte_cnt;
+		*byte_cnt = txqi->backlog_bytes;
 }
 EXPORT_SYMBOL(ieee80211_txq_get_depth);