diff mbox series

[1/6] mt76: use mac80211 txq scheduling

Message ID 20190316204242.73560-1-nbd@nbd.name (mailing list archive)
State Changes Requested
Delegated to: Felix Fietkau
Headers show
Series [1/6] mt76: use mac80211 txq scheduling | expand

Commit Message

Felix Fietkau March 16, 2019, 8:42 p.m. UTC
Performance improvement and preparation for adding airtime fairness support

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 drivers/net/wireless/mediatek/mt76/dma.c  |  6 +-
 drivers/net/wireless/mediatek/mt76/mt76.h |  3 +-
 drivers/net/wireless/mediatek/mt76/tx.c   | 98 ++++++++++-------------
 drivers/net/wireless/mediatek/mt76/usb.c  |  3 +-
 4 files changed, 50 insertions(+), 60 deletions(-)

Comments

Toke Høiland-Jørgensen March 16, 2019, 10:28 p.m. UTC | #1
Felix Fietkau <nbd@nbd.name> writes:

> Performance improvement and preparation for adding airtime fairness
> support

Great to see this! Do you have a plan for the airtime fairness part?
I.e., how to get the airtime information?

Only one other comment, below.

> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
>  drivers/net/wireless/mediatek/mt76/dma.c  |  6 +-
>  drivers/net/wireless/mediatek/mt76/mt76.h |  3 +-
>  drivers/net/wireless/mediatek/mt76/tx.c   | 98 ++++++++++-------------
>  drivers/net/wireless/mediatek/mt76/usb.c  |  3 +-
>  4 files changed, 50 insertions(+), 60 deletions(-)
>
> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
> index 7b8a998103d7..09978757e7d1 100644
> --- a/drivers/net/wireless/mediatek/mt76/dma.c
> +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> @@ -184,9 +184,7 @@ mt76_dma_tx_cleanup(struct mt76_dev *dev, enum mt76_txq_id qid, bool flush)
>  			last = ioread32(&q->regs->dma_idx);
>  	}
>  
> -	if (!flush)
> -		mt76_txq_schedule(dev, sq);
> -	else
> +	if (flush)
>  		mt76_dma_sync_idx(dev, q);
>  
>  	wake = wake && q->stopped &&
> @@ -199,6 +197,8 @@ mt76_dma_tx_cleanup(struct mt76_dev *dev, enum mt76_txq_id qid, bool flush)
>  
>  	spin_unlock_bh(&q->lock);
>  
> +	if (!flush)
> +		mt76_txq_schedule(dev, qid);
>  	if (wake)
>  		ieee80211_wake_queue(dev->hw, qid);
>  }
> diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
> index edff44f32c8e..5d44f721d184 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt76.h
> +++ b/drivers/net/wireless/mediatek/mt76/mt76.h
> @@ -224,7 +224,6 @@ struct mt76_wcid {
>  };
>  
>  struct mt76_txq {
> -	struct list_head list;
>  	struct mt76_sw_queue *swq;
>  	struct mt76_wcid *wcid;
>  
> @@ -683,7 +682,7 @@ void mt76_txq_remove(struct mt76_dev *dev, struct ieee80211_txq *txq);
>  void mt76_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *txq);
>  void mt76_stop_tx_queues(struct mt76_dev *dev, struct ieee80211_sta *sta,
>  			 bool send_bar);
> -void mt76_txq_schedule(struct mt76_dev *dev, struct mt76_sw_queue *sq);
> +void mt76_txq_schedule(struct mt76_dev *dev, enum mt76_txq_id qid);
>  void mt76_txq_schedule_all(struct mt76_dev *dev);
>  void mt76_release_buffered_frames(struct ieee80211_hw *hw,
>  				  struct ieee80211_sta *sta,
> diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c
> index 2c82db0b5834..ef9a0bbd64c1 100644
> --- a/drivers/net/wireless/mediatek/mt76/tx.c
> +++ b/drivers/net/wireless/mediatek/mt76/tx.c
> @@ -479,23 +479,37 @@ mt76_txq_send_burst(struct mt76_dev *dev, struct mt76_sw_queue *sq,
>  }
>  
>  static int
> -mt76_txq_schedule_list(struct mt76_dev *dev, struct mt76_sw_queue *sq)
> +mt76_txq_schedule_list(struct mt76_dev *dev, enum mt76_txq_id qid)
>  {
> +	struct mt76_sw_queue *sq = &dev->q_tx[qid];
>  	struct mt76_queue *hwq = sq->q;
> -	struct mt76_txq *mtxq, *mtxq_last;
> -	int len = 0;
> +	struct ieee80211_txq *txq;
> +	struct mt76_txq *mtxq;
> +	struct mt76_wcid *wcid;
> +	int ret = 0;
>  
> -restart:
> -	mtxq_last = list_last_entry(&sq->swq, struct mt76_txq, list);
> -	while (!list_empty(&sq->swq)) {
> +	spin_lock_bh(&hwq->lock);
> +	while (1) {
>  		bool empty = false;
> -		int cur;
> +
> +		if (sq->swq_queued >= 4)
> +			break;
>  
>  		if (test_bit(MT76_OFFCHANNEL, &dev->state) ||
> -		    test_bit(MT76_RESET, &dev->state))
> -			return -EBUSY;
> +		    test_bit(MT76_RESET, &dev->state)) {
> +			ret = -EBUSY;
> +			break;
> +		}
> +
> +		txq = ieee80211_next_txq(dev->hw, qid);
> +		if (!txq)
> +			break;
> +
> +		mtxq = (struct mt76_txq *)txq->drv_priv;
> +		wcid = mtxq->wcid;
> +		if (wcid && test_bit(MT_WCID_FLAG_PS, &wcid->flags))
> +			continue;
>  
> -		mtxq = list_first_entry(&sq->swq, struct mt76_txq, list);
>  		if (mtxq->send_bar && mtxq->aggr) {
>  			struct ieee80211_txq *txq = mtxq_to_txq(mtxq);
>  			struct ieee80211_sta *sta = txq->sta;
> @@ -507,38 +521,36 @@ mt76_txq_schedule_list(struct mt76_dev *dev, struct mt76_sw_queue *sq)
>  			spin_unlock_bh(&hwq->lock);
>  			ieee80211_send_bar(vif, sta->addr, tid, agg_ssn);
>  			spin_lock_bh(&hwq->lock);
> -			goto restart;
>  		}
>  
> -		list_del_init(&mtxq->list);
> -
> -		cur = mt76_txq_send_burst(dev, sq, mtxq, &empty);
> +		ret += mt76_txq_send_burst(dev, sq, mtxq, &empty);
>  		if (!empty)
> -			list_add_tail(&mtxq->list, &sq->swq);
> -
> -		if (cur < 0)
> -			return cur;
> -
> -		len += cur;
> -
> -		if (mtxq == mtxq_last)
> -			break;
> +			ieee80211_return_txq(dev->hw, txq);

The call to ieee80211_return_txq() is really meant to be unconditional.
The TXQ will only actually be scheduled if it still has packets queued.
I know it's slightly more expensive to have the check in mac80211, but
this is what makes it possible to change the implementation without
touching the drivers (such as in the RFC patch I sent earlier that
switches the scheduling algorithm)...

-Toke
Felix Fietkau March 17, 2019, 10:44 a.m. UTC | #2
On 2019-03-16 23:28, Toke Høiland-Jørgensen wrote:
> Felix Fietkau <nbd@nbd.name> writes:
> 
>> Performance improvement and preparation for adding airtime fairness
>> support
> 
> Great to see this! Do you have a plan for the airtime fairness part?
> I.e., how to get the airtime information?
Not yet. Still need to investigate what kind of information the hardware
can provide. On a first glance it seems rather limited, so we may have
to approximate based on tx status rates/retry and average packet size.

> The call to ieee80211_return_txq() is really meant to be unconditional.
> The TXQ will only actually be scheduled if it still has packets queued.
> I know it's slightly more expensive to have the check in mac80211, but
> this is what makes it possible to change the implementation without
> touching the drivers (such as in the RFC patch I sent earlier that
> switches the scheduling algorithm)...
I think this API needs to be extended to allow the driver to specify
that it has buffered packets for a txq. Otherwise there's a small window
where the driver has packets for a txq but mac80211 doesn't, and
mac80211 won't schedule the queue in that case.
I'll send a patch for this soon.

- Felix
Toke Høiland-Jørgensen March 17, 2019, 11:32 a.m. UTC | #3
Felix Fietkau <nbd@nbd.name> writes:

> On 2019-03-16 23:28, Toke Høiland-Jørgensen wrote:
>> Felix Fietkau <nbd@nbd.name> writes:
>> 
>>> Performance improvement and preparation for adding airtime fairness
>>> support
>> 
>> Great to see this! Do you have a plan for the airtime fairness part?
>> I.e., how to get the airtime information?
> Not yet. Still need to investigate what kind of information the hardware
> can provide. On a first glance it seems rather limited, so we may have
> to approximate based on tx status rates/retry and average packet size.

OK, cool. A byte-based estimator can also be useful for preventing dumb
firmware from buffering too much. The Chromium guys did that for ath10k:

https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/588190/13/drivers/net/wireless-4.2/ath/ath10k/mac.c#3826

>> The call to ieee80211_return_txq() is really meant to be unconditional.
>> The TXQ will only actually be scheduled if it still has packets queued.
>> I know it's slightly more expensive to have the check in mac80211, but
>> this is what makes it possible to change the implementation without
>> touching the drivers (such as in the RFC patch I sent earlier that
>> switches the scheduling algorithm)...
> I think this API needs to be extended to allow the driver to specify
> that it has buffered packets for a txq. Otherwise there's a small window
> where the driver has packets for a txq but mac80211 doesn't, and
> mac80211 won't schedule the queue in that case.
> I'll send a patch for this soon.

Right, makes sense. As long as mac80211 is in control over how it will
react to that information (thus allowing to e.g., invert the logic if
needed), I have no objections to extending the API... :)

-Toke
Felix Fietkau March 17, 2019, 12:32 p.m. UTC | #4
On 2019-03-17 12:32, Toke Høiland-Jørgensen wrote:
> Felix Fietkau <nbd@nbd.name> writes:
> 
>> On 2019-03-16 23:28, Toke Høiland-Jørgensen wrote:
>>> Felix Fietkau <nbd@nbd.name> writes:
>>> 
>>>> Performance improvement and preparation for adding airtime fairness
>>>> support
>>> 
>>> Great to see this! Do you have a plan for the airtime fairness part?
>>> I.e., how to get the airtime information?
>> Not yet. Still need to investigate what kind of information the hardware
>> can provide. On a first glance it seems rather limited, so we may have
>> to approximate based on tx status rates/retry and average packet size.
> 
> OK, cool. A byte-based estimator can also be useful for preventing dumb
> firmware from buffering too much. The Chromium guys did that for ath10k:
> 
> https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/588190/13/drivers/net/wireless-4.2/ath/ath10k/mac.c#3826
Interesting, thanks. I can probably use some ideas from that.

>>> The call to ieee80211_return_txq() is really meant to be unconditional.
>>> The TXQ will only actually be scheduled if it still has packets queued.
>>> I know it's slightly more expensive to have the check in mac80211, but
>>> this is what makes it possible to change the implementation without
>>> touching the drivers (such as in the RFC patch I sent earlier that
>>> switches the scheduling algorithm)...
>> I think this API needs to be extended to allow the driver to specify
>> that it has buffered packets for a txq. Otherwise there's a small window
>> where the driver has packets for a txq but mac80211 doesn't, and
>> mac80211 won't schedule the queue in that case.
>> I'll send a patch for this soon.
> 
> Right, makes sense. As long as mac80211 is in control over how it will
> react to that information (thus allowing to e.g., invert the logic if
> needed), I have no objections to extending the API... :)I'm thinking of changing the code to make ieee80211_schedule_txq add the
txq to the list, even if mac80211 does not have any frames buffered for it.

I've looked at ath9k (the only user at the moment), and it seems to call
the function in that way already: at PS wake or tx status time if it has
frames in its internal retry queue.
While it does not match the current documented behavior for that
function, it nicely fits ath9k's currently unfulfilled expectations ;)

- Felix
Toke Høiland-Jørgensen March 17, 2019, 9:59 p.m. UTC | #5
Felix Fietkau <nbd@nbd.name> writes:

> On 2019-03-17 12:32, Toke Høiland-Jørgensen wrote:
>> Felix Fietkau <nbd@nbd.name> writes:
>> 
>>> On 2019-03-16 23:28, Toke Høiland-Jørgensen wrote:
>>>> Felix Fietkau <nbd@nbd.name> writes:
>>>> 
>>>>> Performance improvement and preparation for adding airtime fairness
>>>>> support
>>>> 
>>>> Great to see this! Do you have a plan for the airtime fairness part?
>>>> I.e., how to get the airtime information?
>>> Not yet. Still need to investigate what kind of information the hardware
>>> can provide. On a first glance it seems rather limited, so we may have
>>> to approximate based on tx status rates/retry and average packet size.
>> 
>> OK, cool. A byte-based estimator can also be useful for preventing dumb
>> firmware from buffering too much. The Chromium guys did that for ath10k:
>> 
>> https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/588190/13/drivers/net/wireless-4.2/ath/ath10k/mac.c#3826
> Interesting, thanks. I can probably use some ideas from that.
>
>>>> The call to ieee80211_return_txq() is really meant to be unconditional.
>>>> The TXQ will only actually be scheduled if it still has packets queued.
>>>> I know it's slightly more expensive to have the check in mac80211, but
>>>> this is what makes it possible to change the implementation without
>>>> touching the drivers (such as in the RFC patch I sent earlier that
>>>> switches the scheduling algorithm)...
>>> I think this API needs to be extended to allow the driver to specify
>>> that it has buffered packets for a txq. Otherwise there's a small window
>>> where the driver has packets for a txq but mac80211 doesn't, and
>>> mac80211 won't schedule the queue in that case.
>>> I'll send a patch for this soon.
>> 
>> Right, makes sense. As long as mac80211 is in control over how it will
>> react to that information (thus allowing to e.g., invert the logic if
>> needed), I have no objections to extending the API... :)I'm thinking of changing the code to make ieee80211_schedule_txq add the
> txq to the list, even if mac80211 does not have any frames buffered for it.
>
> I've looked at ath9k (the only user at the moment), and it seems to call
> the function in that way already: at PS wake or tx status time if it has
> frames in its internal retry queue.
> While it does not match the current documented behavior for that
> function, it nicely fits ath9k's currently unfulfilled expectations ;)

Heh, fair point :)

-Toke
Felix Fietkau March 18, 2019, 8:08 p.m. UTC | #6
On 2019-03-17 22:59, Toke Høiland-Jørgensen wrote:
> Felix Fietkau <nbd@nbd.name> writes:
>> I've looked at ath9k (the only user at the moment), and it seems to call
>> the function in that way already: at PS wake or tx status time if it has
>> frames in its internal retry queue.
>> While it does not match the current documented behavior for that
>> function, it nicely fits ath9k's currently unfulfilled expectations ;)
> 
> Heh, fair point :)
I noticed another issue: after the migration to the mac80211 txq
scheduling code, ath9k does not handle stations going to powersave
properly anymore. mac80211 keeps returning txqs for stations that have
gone to sleep and ath9k will send out frames for them.

We could deal with this in the driver and simply not return queues for
stations in PS mode, or mac80211 could actively cancel them once a
station enters PS mode. What do you prefer?

- Felix
Toke Høiland-Jørgensen March 18, 2019, 10:14 p.m. UTC | #7
Felix Fietkau <nbd@nbd.name> writes:

> On 2019-03-17 22:59, Toke Høiland-Jørgensen wrote:
>> Felix Fietkau <nbd@nbd.name> writes:
>>> I've looked at ath9k (the only user at the moment), and it seems to call
>>> the function in that way already: at PS wake or tx status time if it has
>>> frames in its internal retry queue.
>>> While it does not match the current documented behavior for that
>>> function, it nicely fits ath9k's currently unfulfilled expectations ;)
>> 
>> Heh, fair point :)
> I noticed another issue: after the migration to the mac80211 txq
> scheduling code, ath9k does not handle stations going to powersave
> properly anymore. mac80211 keeps returning txqs for stations that have
> gone to sleep and ath9k will send out frames for them.

Ah, right. Never did have a good grip on the powersave code...

> We could deal with this in the driver and simply not return queues for
> stations in PS mode, or mac80211 could actively cancel them once a
> station enters PS mode. What do you prefer?

I think the cleanest would be if mac80211 handled it and just
un-scheduled stations when they go to sleep.

BTW, I was just thinking the other day about why the retry queue is kept
around when a station goes to sleep? Isn't the station usually sleeping
pretty long (>100 ms), where it might make more sense to drop things
rather than try again once i comes back?

-Toke
Felix Fietkau March 18, 2019, 10:37 p.m. UTC | #8
On 2019-03-18 23:14, Toke Høiland-Jørgensen wrote:
> Felix Fietkau <nbd@nbd.name> writes:
> 
>> On 2019-03-17 22:59, Toke Høiland-Jørgensen wrote:
>>> Felix Fietkau <nbd@nbd.name> writes:
>>>> I've looked at ath9k (the only user at the moment), and it seems to call
>>>> the function in that way already: at PS wake or tx status time if it has
>>>> frames in its internal retry queue.
>>>> While it does not match the current documented behavior for that
>>>> function, it nicely fits ath9k's currently unfulfilled expectations ;)
>>> 
>>> Heh, fair point :)
>> I noticed another issue: after the migration to the mac80211 txq
>> scheduling code, ath9k does not handle stations going to powersave
>> properly anymore. mac80211 keeps returning txqs for stations that have
>> gone to sleep and ath9k will send out frames for them.
> 
> Ah, right. Never did have a good grip on the powersave code...
> 
>> We could deal with this in the driver and simply not return queues for
>> stations in PS mode, or mac80211 could actively cancel them once a
>> station enters PS mode. What do you prefer?
> 
> I think the cleanest would be if mac80211 handled it and just
> un-scheduled stations when they go to sleep.
I agree. I'll send a patch tomorrow.
> BTW, I was just thinking the other day about why the retry queue is kept
> around when a station goes to sleep? Isn't the station usually sleeping
> pretty long (>100 ms), where it might make more sense to drop things
> rather than try again once i comes back?
It doesn't always sleep long. It might just be background scanning.
There's no way for the AP to know in advance.
In any case, the A-MPDU receiver reorder window still needs to be
maintained, so dropping frames means we'd have to send a BAR frame to
notify the receiver, which costs airtime as well.

- Felix
Toke Høiland-Jørgensen March 18, 2019, 11:05 p.m. UTC | #9
Felix Fietkau <nbd@nbd.name> writes:

> On 2019-03-18 23:14, Toke Høiland-Jørgensen wrote:
>> Felix Fietkau <nbd@nbd.name> writes:
>> 
>>> On 2019-03-17 22:59, Toke Høiland-Jørgensen wrote:
>>>> Felix Fietkau <nbd@nbd.name> writes:
>>>>> I've looked at ath9k (the only user at the moment), and it seems to call
>>>>> the function in that way already: at PS wake or tx status time if it has
>>>>> frames in its internal retry queue.
>>>>> While it does not match the current documented behavior for that
>>>>> function, it nicely fits ath9k's currently unfulfilled expectations ;)
>>>> 
>>>> Heh, fair point :)
>>> I noticed another issue: after the migration to the mac80211 txq
>>> scheduling code, ath9k does not handle stations going to powersave
>>> properly anymore. mac80211 keeps returning txqs for stations that have
>>> gone to sleep and ath9k will send out frames for them.
>> 
>> Ah, right. Never did have a good grip on the powersave code...
>> 
>>> We could deal with this in the driver and simply not return queues for
>>> stations in PS mode, or mac80211 could actively cancel them once a
>>> station enters PS mode. What do you prefer?
>> 
>> I think the cleanest would be if mac80211 handled it and just
>> un-scheduled stations when they go to sleep.
> I agree. I'll send a patch tomorrow.

Cool, thanks! :)

>> BTW, I was just thinking the other day about why the retry queue is kept
>> around when a station goes to sleep? Isn't the station usually sleeping
>> pretty long (>100 ms), where it might make more sense to drop things
>> rather than try again once i comes back?
> It doesn't always sleep long. It might just be background scanning.
> There's no way for the AP to know in advance.
> In any case, the A-MPDU receiver reorder window still needs to be
> maintained, so dropping frames means we'd have to send a BAR frame to
> notify the receiver, which costs airtime as well.

Right, makes sense; thanks for the explanation.

-Toke
diff mbox series

Patch

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index 7b8a998103d7..09978757e7d1 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -184,9 +184,7 @@  mt76_dma_tx_cleanup(struct mt76_dev *dev, enum mt76_txq_id qid, bool flush)
 			last = ioread32(&q->regs->dma_idx);
 	}
 
-	if (!flush)
-		mt76_txq_schedule(dev, sq);
-	else
+	if (flush)
 		mt76_dma_sync_idx(dev, q);
 
 	wake = wake && q->stopped &&
@@ -199,6 +197,8 @@  mt76_dma_tx_cleanup(struct mt76_dev *dev, enum mt76_txq_id qid, bool flush)
 
 	spin_unlock_bh(&q->lock);
 
+	if (!flush)
+		mt76_txq_schedule(dev, qid);
 	if (wake)
 		ieee80211_wake_queue(dev->hw, qid);
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index edff44f32c8e..5d44f721d184 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -224,7 +224,6 @@  struct mt76_wcid {
 };
 
 struct mt76_txq {
-	struct list_head list;
 	struct mt76_sw_queue *swq;
 	struct mt76_wcid *wcid;
 
@@ -683,7 +682,7 @@  void mt76_txq_remove(struct mt76_dev *dev, struct ieee80211_txq *txq);
 void mt76_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *txq);
 void mt76_stop_tx_queues(struct mt76_dev *dev, struct ieee80211_sta *sta,
 			 bool send_bar);
-void mt76_txq_schedule(struct mt76_dev *dev, struct mt76_sw_queue *sq);
+void mt76_txq_schedule(struct mt76_dev *dev, enum mt76_txq_id qid);
 void mt76_txq_schedule_all(struct mt76_dev *dev);
 void mt76_release_buffered_frames(struct ieee80211_hw *hw,
 				  struct ieee80211_sta *sta,
diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c
index 2c82db0b5834..ef9a0bbd64c1 100644
--- a/drivers/net/wireless/mediatek/mt76/tx.c
+++ b/drivers/net/wireless/mediatek/mt76/tx.c
@@ -479,23 +479,37 @@  mt76_txq_send_burst(struct mt76_dev *dev, struct mt76_sw_queue *sq,
 }
 
 static int
-mt76_txq_schedule_list(struct mt76_dev *dev, struct mt76_sw_queue *sq)
+mt76_txq_schedule_list(struct mt76_dev *dev, enum mt76_txq_id qid)
 {
+	struct mt76_sw_queue *sq = &dev->q_tx[qid];
 	struct mt76_queue *hwq = sq->q;
-	struct mt76_txq *mtxq, *mtxq_last;
-	int len = 0;
+	struct ieee80211_txq *txq;
+	struct mt76_txq *mtxq;
+	struct mt76_wcid *wcid;
+	int ret = 0;
 
-restart:
-	mtxq_last = list_last_entry(&sq->swq, struct mt76_txq, list);
-	while (!list_empty(&sq->swq)) {
+	spin_lock_bh(&hwq->lock);
+	while (1) {
 		bool empty = false;
-		int cur;
+
+		if (sq->swq_queued >= 4)
+			break;
 
 		if (test_bit(MT76_OFFCHANNEL, &dev->state) ||
-		    test_bit(MT76_RESET, &dev->state))
-			return -EBUSY;
+		    test_bit(MT76_RESET, &dev->state)) {
+			ret = -EBUSY;
+			break;
+		}
+
+		txq = ieee80211_next_txq(dev->hw, qid);
+		if (!txq)
+			break;
+
+		mtxq = (struct mt76_txq *)txq->drv_priv;
+		wcid = mtxq->wcid;
+		if (wcid && test_bit(MT_WCID_FLAG_PS, &wcid->flags))
+			continue;
 
-		mtxq = list_first_entry(&sq->swq, struct mt76_txq, list);
 		if (mtxq->send_bar && mtxq->aggr) {
 			struct ieee80211_txq *txq = mtxq_to_txq(mtxq);
 			struct ieee80211_sta *sta = txq->sta;
@@ -507,38 +521,36 @@  mt76_txq_schedule_list(struct mt76_dev *dev, struct mt76_sw_queue *sq)
 			spin_unlock_bh(&hwq->lock);
 			ieee80211_send_bar(vif, sta->addr, tid, agg_ssn);
 			spin_lock_bh(&hwq->lock);
-			goto restart;
 		}
 
-		list_del_init(&mtxq->list);
-
-		cur = mt76_txq_send_burst(dev, sq, mtxq, &empty);
+		ret += mt76_txq_send_burst(dev, sq, mtxq, &empty);
 		if (!empty)
-			list_add_tail(&mtxq->list, &sq->swq);
-
-		if (cur < 0)
-			return cur;
-
-		len += cur;
-
-		if (mtxq == mtxq_last)
-			break;
+			ieee80211_return_txq(dev->hw, txq);
 	}
+	spin_unlock_bh(&hwq->lock);
 
-	return len;
+	return ret;
 }
 
-void mt76_txq_schedule(struct mt76_dev *dev, struct mt76_sw_queue *sq)
+void mt76_txq_schedule(struct mt76_dev *dev, enum mt76_txq_id qid)
 {
+	struct mt76_sw_queue *sq = &dev->q_tx[qid];
 	int len;
 
+	if (qid >= 4)
+		return;
+
+	if (sq->swq_queued >= 4)
+		return;
+
 	rcu_read_lock();
-	do {
-		if (sq->swq_queued >= 4 || list_empty(&sq->swq))
-			break;
 
-		len = mt76_txq_schedule_list(dev, sq);
+	do {
+		ieee80211_txq_schedule_start(dev->hw, qid);
+		len = mt76_txq_schedule_list(dev, qid);
+		ieee80211_txq_schedule_end(dev->hw, qid);
 	} while (len > 0);
+
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(mt76_txq_schedule);
@@ -547,13 +559,8 @@  void mt76_txq_schedule_all(struct mt76_dev *dev)
 {
 	int i;
 
-	for (i = 0; i <= MT_TXQ_BK; i++) {
-		struct mt76_queue *q = dev->q_tx[i].q;
-
-		spin_lock_bh(&q->lock);
-		mt76_txq_schedule(dev, &dev->q_tx[i]);
-		spin_unlock_bh(&q->lock);
-	}
+	for (i = 0; i <= MT_TXQ_BK; i++)
+		mt76_txq_schedule(dev, i);
 }
 EXPORT_SYMBOL_GPL(mt76_txq_schedule_all);
 
@@ -575,8 +582,6 @@  void mt76_stop_tx_queues(struct mt76_dev *dev, struct ieee80211_sta *sta,
 
 		spin_lock_bh(&hwq->lock);
 		mtxq->send_bar = mtxq->aggr && send_bar;
-		if (!list_empty(&mtxq->list))
-			list_del_init(&mtxq->list);
 		spin_unlock_bh(&hwq->lock);
 	}
 }
@@ -585,24 +590,16 @@  EXPORT_SYMBOL_GPL(mt76_stop_tx_queues);
 void mt76_wake_tx_queue(struct ieee80211_hw *hw, struct ieee80211_txq *txq)
 {
 	struct mt76_dev *dev = hw->priv;
-	struct mt76_txq *mtxq = (struct mt76_txq *)txq->drv_priv;
-	struct mt76_sw_queue *sq = mtxq->swq;
-	struct mt76_queue *hwq = sq->q;
 
 	if (!test_bit(MT76_STATE_RUNNING, &dev->state))
 		return;
 
-	spin_lock_bh(&hwq->lock);
-	if (list_empty(&mtxq->list))
-		list_add_tail(&mtxq->list, &sq->swq);
-	mt76_txq_schedule(dev, sq);
-	spin_unlock_bh(&hwq->lock);
+	mt76_txq_schedule(dev, txq->ac);
 }
 EXPORT_SYMBOL_GPL(mt76_wake_tx_queue);
 
 void mt76_txq_remove(struct mt76_dev *dev, struct ieee80211_txq *txq)
 {
-	struct mt76_queue *hwq;
 	struct mt76_txq *mtxq;
 	struct sk_buff *skb;
 
@@ -610,12 +607,6 @@  void mt76_txq_remove(struct mt76_dev *dev, struct ieee80211_txq *txq)
 		return;
 
 	mtxq = (struct mt76_txq *) txq->drv_priv;
-	hwq = mtxq->swq->q;
-
-	spin_lock_bh(&hwq->lock);
-	if (!list_empty(&mtxq->list))
-		list_del_init(&mtxq->list);
-	spin_unlock_bh(&hwq->lock);
 
 	while ((skb = skb_dequeue(&mtxq->retry_q)) != NULL)
 		ieee80211_free_txskb(dev->hw, skb);
@@ -626,7 +617,6 @@  void mt76_txq_init(struct mt76_dev *dev, struct ieee80211_txq *txq)
 {
 	struct mt76_txq *mtxq = (struct mt76_txq *) txq->drv_priv;
 
-	INIT_LIST_HEAD(&mtxq->list);
 	skb_queue_head_init(&mtxq->retry_q);
 
 	mtxq->swq = &dev->q_tx[mt76_txq_get_qid(txq)];
diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c
index 27896a435d6c..2dbd8dfd62a0 100644
--- a/drivers/net/wireless/mediatek/mt76/usb.c
+++ b/drivers/net/wireless/mediatek/mt76/usb.c
@@ -656,7 +656,6 @@  static void mt76u_tx_tasklet(unsigned long data)
 			dev->drv->tx_complete_skb(dev, i, &entry);
 			spin_lock_bh(&q->lock);
 		}
-		mt76_txq_schedule(dev, sq);
 
 		wake = q->stopped && q->queued < q->ndesc - 8;
 		if (wake)
@@ -667,6 +666,8 @@  static void mt76u_tx_tasklet(unsigned long data)
 
 		spin_unlock_bh(&q->lock);
 
+		mt76_txq_schedule(dev, i);
+
 		if (!test_and_set_bit(MT76_READING_STATS, &dev->state))
 			ieee80211_queue_delayed_work(dev->hw,
 						     &dev->usb.stat_work,