[v2] wifi: mt76: fix potential memory leakage

Message ID 0a218783098a6b1b90ff120c7182acee93dc8feb.1671423784.git.Bo.Jiao@mediatek.com (mailing list archive)
State Superseded
Delegated to: Felix Fietkau
Series [v2] wifi: mt76: fix potential memory leakage

Commit Message

Bo Jiao Dec. 19, 2022, 4:48 a.m. UTC
From: Bo Jiao <Bo.Jiao@mediatek.com>

Fix a potential memory leak: recycle the rxwi when the mt76_dma_add_buf() call fails.

Signed-off-by: Bo Jiao <Bo.Jiao@mediatek.com>
---
v2:
- recycle the rxwi when page_frag_alloc() or dma_map_single() fails.
---
 drivers/net/wireless/mediatek/mt76/dma.c | 27 ++++++++++++++----------
 1 file changed, 16 insertions(+), 11 deletions(-)

Comments

Lorenzo Bianconi Dec. 19, 2022, 9:55 a.m. UTC | #1
> From: Bo Jiao <Bo.Jiao@mediatek.com>
> 
> Fix a potential memory leak: recycle the rxwi when the mt76_dma_add_buf() call fails.
> 
> Signed-off-by: Bo Jiao <Bo.Jiao@mediatek.com>
> ---
> v2:
> - recycle the rxwi when page_frag_alloc() or dma_map_single() fails.
> ---
>  drivers/net/wireless/mediatek/mt76/dma.c | 27 ++++++++++++++----------
>  1 file changed, 16 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
> index fc24b35..76ad47d 100644
> --- a/drivers/net/wireless/mediatek/mt76/dma.c
> +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> @@ -580,24 +580,29 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
>  
>  		buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
>  		if (!buf)
> -			break;
> +			goto out;
>  
>  		addr = dma_map_single(dev->dma_dev, buf, len, DMA_FROM_DEVICE);
> -		if (unlikely(dma_mapping_error(dev->dma_dev, addr))) {
> -			skb_free_frag(buf);
> -			break;
> -		}
> +		if (unlikely(dma_mapping_error(dev->dma_dev, addr)))
> +			goto free;
>  
>  		qbuf.addr = addr + offset;
>  		qbuf.len = len - offset;
>  		qbuf.skip_unmap = false;
> -		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0) {
> -			dma_unmap_single(dev->dma_dev, addr, len,
> -					 DMA_FROM_DEVICE);
> -			skb_free_frag(buf);
> -			break;
> -		}
> +		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0)
> +			goto umap;
> +
>  		frames++;
> +		continue;
> +
> +umap:
> +		dma_unmap_single(dev->dma_dev, addr, len,
> +				 DMA_FROM_DEVICE);
> +free:
> +		skb_free_frag(buf);
> +out:
> +		mt76_put_rxwi(dev, t);
> +		break;
>  	}
>  
>  	if (frames)

Hi Bo,

I guess in the way below, the code is more readable, what do you think?

Regards,
Lorenzo

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index fad5fe19fe18..001538f698f1 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -571,13 +571,6 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 		struct mt76_queue_buf qbuf;
 		void *buf = NULL;
 
-		if ((q->flags & MT_QFLAG_WED) &&
-		    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
-			t = mt76_get_rxwi(dev);
-			if (!t)
-				break;
-		}
-
 		buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
 		if (!buf)
 			break;
@@ -588,16 +581,27 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 			break;
 		}
 
+		if ((q->flags & MT_QFLAG_WED) &&
+		    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
+			t = mt76_get_rxwi(dev);
+			if (!t)
+				goto unmap;
+		}
+
 		qbuf.addr = addr + offset;
 		qbuf.len = len - offset;
 		qbuf.skip_unmap = false;
-		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0) {
-			dma_unmap_single(dev->dma_dev, addr, len,
-					 DMA_FROM_DEVICE);
-			skb_free_frag(buf);
-			break;
+
+		if (!mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t)) {
+			frames++;
+			continue;
 		}
-		frames++;
+
+unmap:
+		dma_unmap_single(dev->dma_dev, addr, len, DMA_FROM_DEVICE);
+		skb_free_frag(buf);
+		mt76_put_rxwi(dev, t);
+		break;
 	}
 
 	if (frames)

> -- 
> 2.18.0
>
Sujuan Chen Dec. 20, 2022, 4:42 a.m. UTC | #2
On Mon, 2022-12-19 at 10:55 +0100, Lorenzo Bianconi wrote:
> > From: Bo Jiao <Bo.Jiao@mediatek.com>
> > 
> > Fix a potential memory leak: recycle the rxwi when the
> > mt76_dma_add_buf() call fails.
> > 
> > Signed-off-by: Bo Jiao <Bo.Jiao@mediatek.com>
> > ---
> > v2:
> > - recycle the rxwi when page_frag_alloc() or dma_map_single() fails.
> > ---
> >  drivers/net/wireless/mediatek/mt76/dma.c | 27 ++++++++++++++----
> > ------
> >  1 file changed, 16 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
> > b/drivers/net/wireless/mediatek/mt76/dma.c
> > index fc24b35..76ad47d 100644
> > --- a/drivers/net/wireless/mediatek/mt76/dma.c
> > +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> > @@ -580,24 +580,29 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> > mt76_queue *q)
> >  
> >  		buf = page_frag_alloc(&q->rx_page, q->buf_size,
> > GFP_ATOMIC);
> >  		if (!buf)
> > -			break;
> > +			goto out;
> >  
> >  		addr = dma_map_single(dev->dma_dev, buf, len,
> > DMA_FROM_DEVICE);
> > -		if (unlikely(dma_mapping_error(dev->dma_dev, addr))) {
> > -			skb_free_frag(buf);
> > -			break;
> > -		}
> > +		if (unlikely(dma_mapping_error(dev->dma_dev, addr)))
> > +			goto free;
> >  
> >  		qbuf.addr = addr + offset;
> >  		qbuf.len = len - offset;
> >  		qbuf.skip_unmap = false;
> > -		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0)
> > {
> > -			dma_unmap_single(dev->dma_dev, addr, len,
> > -					 DMA_FROM_DEVICE);
> > -			skb_free_frag(buf);
> > -			break;
> > -		}
> > +		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0)
> > +			goto umap;
> > +
> >  		frames++;
> > +		continue;
> > +
> > +umap:
> > +		dma_unmap_single(dev->dma_dev, addr, len,
> > +				 DMA_FROM_DEVICE);
> > +free:
> > +		skb_free_frag(buf);
> > +out:
> > +		mt76_put_rxwi(dev, t);
> > +		break;
> >  	}
> >  
> >  	if (frames)
> 
> Hi Bo,
> 
> I guess in the way below, the code is more readable, what do you
> think?
> 
> Regards,
> Lorenzo
> 
> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
> b/drivers/net/wireless/mediatek/mt76/dma.c
> index fad5fe19fe18..001538f698f1 100644
> --- a/drivers/net/wireless/mediatek/mt76/dma.c
> +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> @@ -571,13 +571,6 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> mt76_queue *q)
>  		struct mt76_queue_buf qbuf;
>  		void *buf = NULL;
>  
> -		if ((q->flags & MT_QFLAG_WED) &&
> -		    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) ==
> MT76_WED_Q_RX) {
> -			t = mt76_get_rxwi(dev);
> -			if (!t)
> -				break;
> -		}
> -
>  		buf = page_frag_alloc(&q->rx_page, q->buf_size,
> GFP_ATOMIC);
>  		if (!buf)
>  			break;
> @@ -588,16 +581,27 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> mt76_queue *q)
>  			break;
>  		}
>  
> +		if ((q->flags & MT_QFLAG_WED) &&
> +		    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) ==
> MT76_WED_Q_RX) {
> +			t = mt76_get_rxwi(dev);
> +			if (!t)
> +				goto unmap;
> +		}
> +
>  		qbuf.addr = addr + offset;
>  		qbuf.len = len - offset;
>  		qbuf.skip_unmap = false;
> -		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0)
> {
> -			dma_unmap_single(dev->dma_dev, addr, len,
> -					 DMA_FROM_DEVICE);
> -			skb_free_frag(buf);
> -			break;
> +
> +		if (!mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t)) {
> +			frames++;
> +			continue;
>  		}
> -		frames++;
> +
> +unmap:
> +		dma_unmap_single(dev->dma_dev, addr, len,
> DMA_FROM_DEVICE);
> +		skb_free_frag(buf);
> +		mt76_put_rxwi(dev, t);
> +		break;
>  	}
>  
>  	if (frames)
> 
Hi Lore,

We love your patch, but we have another patch that avoids memory
fragmentation by duplicating the rx skb after mt76_dma_dequeue(). It
requires mt76_get_rxwi() to be placed before page_frag_alloc().

The patch below (it needs a rebase) will be sent after the current
patch is accepted.

--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -386,9 +386,11 @@ mt76_dma_get_buf(struct mt76_dev *dev, struct
mt76_queue *q, int idx,
                                 SKB_WITH_OVERHEAD(q->buf_size),
                                 DMA_FROM_DEVICE);

-               buf = t->ptr;
+               buf = page_frag_alloc(&q->rx_page, q->buf_size,
GFP_ATOMIC);
+               if (!buf)
+                       return NULL;
+               memcpy(buf, t->ptr, SKB_WITH_OVERHEAD(q->buf_size));
                t->dma_addr = 0;
-               t->ptr = NULL;

                mt76_put_rxwi(dev, t);

@@ -569,6 +571,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
mt76_queue *q)
        while (q->queued < q->ndesc - 1) {
                struct mt76_txwi_cache *t = NULL;
                struct mt76_queue_buf qbuf;
+               bool skip_alloc = false;
                void *buf = NULL;

                if ((q->flags & MT_QFLAG_WED) &&
@@ -576,11 +579,19 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
mt76_queue *q)
                        t = mt76_get_rxwi(dev);
                        if (!t)
                                break;
+
+                       if (t->ptr) {
+                               skip_alloc = true;
+                               buf = t->ptr;
+                       }
                }

-               buf = page_frag_alloc(&q->rx_page, q->buf_size,
GFP_ATOMIC);
-               if (!buf)
-                       break;
+               if (!skip_alloc) {
+                       buf = page_frag_alloc(&q->rx_page, q->buf_size,
+                                             GFP_ATOMIC);
+                       if (!buf)
+                               break;
+               }

                addr = dma_map_single(dev->dma_dev, buf, len,
DMA_FROM_DEVICE);
                if (unlikely(dma_mapping_error(dev->dma_dev, addr))) {
--
2.18.0

> > -- 
> > 2.18.0
> >
Lorenzo Bianconi Dec. 21, 2022, 10:26 a.m. UTC | #3
[...]

> Hi Lore,

Hi Sujuan,

> 
> We love your patch, but we have another patch that avoids memory
> fragmentation by duplicating the rx skb after mt76_dma_dequeue(). It
> requires mt76_get_rxwi() to be placed before page_frag_alloc().
> 
> The patch below (it needs a rebase) will be sent after the current
> patch is accepted.
> 
> --- a/drivers/net/wireless/mediatek/mt76/dma.c
> +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> @@ -386,9 +386,11 @@ mt76_dma_get_buf(struct mt76_dev *dev, struct
> mt76_queue *q, int idx,
>                                  SKB_WITH_OVERHEAD(q->buf_size),
>                                  DMA_FROM_DEVICE);
> 
> -               buf = t->ptr;
> +               buf = page_frag_alloc(&q->rx_page, q->buf_size,
> GFP_ATOMIC);
> +               if (!buf)
> +                       return NULL;
> +               memcpy(buf, t->ptr, SKB_WITH_OVERHEAD(q->buf_size));

With this approach we still need to allocate the buffer (in
mt76_dma_get_buf() instead of mt76_dma_rx_fill()), but we also have to
copy the full buffer, which can be pretty expensive compared to relying
on the DMA, so I would avoid this approach.

Regards,
Lorenzo

>                 t->dma_addr = 0;
> -               t->ptr = NULL;
> 
>                 mt76_put_rxwi(dev, t);
> 
> @@ -569,6 +571,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> mt76_queue *q)
>         while (q->queued < q->ndesc - 1) {
>                 struct mt76_txwi_cache *t = NULL;
>                 struct mt76_queue_buf qbuf;
> +               bool skip_alloc = false;
>                 void *buf = NULL;
> 
>                 if ((q->flags & MT_QFLAG_WED) &&
> @@ -576,11 +579,19 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> mt76_queue *q)
>                         t = mt76_get_rxwi(dev);
>                         if (!t)
>                                 break;
> +
> +                       if (t->ptr) {
> +                               skip_alloc = true;
> +                               buf = t->ptr;
> +                       }
>                 }
> 
> -               buf = page_frag_alloc(&q->rx_page, q->buf_size,
> GFP_ATOMIC);
> -               if (!buf)
> -                       break;
> +               if (!skip_alloc) {
> +                       buf = page_frag_alloc(&q->rx_page, q->buf_size,
> +                                             GFP_ATOMIC);
> +                       if (!buf)
> +                               break;
> +               }
> 
>                 addr = dma_map_single(dev->dma_dev, buf, len,
> DMA_FROM_DEVICE);
>                 if (unlikely(dma_mapping_error(dev->dma_dev, addr))) {
> --
> 2.18.0
> 
> > > -- 
> > > 2.18.0
> > >
Sujuan Chen Dec. 21, 2022, 2:48 p.m. UTC | #4
On Wed, 2022-12-21 at 11:26 +0100, lorenzo@kernel.org wrote:
> [...]
> 
> > Hi Lore,
> 
> Hi Sujuan,
> 
> > 
> > We love your patch, but we have another patch that avoids memory
> > fragmentation by duplicating the rx skb after mt76_dma_dequeue().
> > It requires mt76_get_rxwi() to be placed before page_frag_alloc().
> > 
> > The patch below (it needs a rebase) will be sent after the current
> > patch is accepted.
> > 
> > --- a/drivers/net/wireless/mediatek/mt76/dma.c
> > +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> > @@ -386,9 +386,11 @@ mt76_dma_get_buf(struct mt76_dev *dev, struct
> > mt76_queue *q, int idx,
> >                                  SKB_WITH_OVERHEAD(q->buf_size),
> >                                  DMA_FROM_DEVICE);
> > 
> > -               buf = t->ptr;
> > +               buf = page_frag_alloc(&q->rx_page, q->buf_size,
> > GFP_ATOMIC);
> > +               if (!buf)
> > +                       return NULL;
> > +               memcpy(buf, t->ptr, SKB_WITH_OVERHEAD(q-
> > >buf_size));
> 
> With this approach we still need to allocate the buffer (in
> mt76_dma_get_buf() instead of mt76_dma_rx_fill()), but we also have
> to copy the full buffer, which can be pretty expensive compared to
> relying on the DMA, so I would avoid this approach.
> 

Hi Lore,

Yes, it is expensive, but without the memcpy it will cause memory
fragmentation (we hit this issue in internal SQC).

As we know, WED needs to exchange rx packets (belonging to the WED rx
buffer manager) with the wifi driver (DMA rx data queue) by exchanging
WFDMA descriptors (dmad), to keep free rx buffers available to WED.

It is possible that a large number of buffers get exchanged to WED and
cannot come back to the WLAN driver. So the memory from the same 32K
page cache cannot be released, and page_frag_alloc() in
mt76_dma_rx_fill() will fail.

Any ideas other than a memcpy?

Regards,
Sujuan
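
To make the failure mode concrete: a minimal sketch (illustration only,
not driver code) of why one stuck fragment pins a whole frag-cache page.

	/* page_frag_alloc() carves fragments out of one high-order
	 * page (up to PAGE_FRAG_CACHE_MAX_SIZE, i.e. 32K); the page
	 * can only be freed once every fragment has been returned via
	 * skb_free_frag()/page_frag_free(). */
	struct page_frag_cache cache = {};
	void *a = page_frag_alloc(&cache, 2048, GFP_ATOMIC); /* handed to WED */
	void *b = page_frag_alloc(&cache, 2048, GFP_ATOMIC); /* local rx buf */

	skb_free_frag(b);
	/* Freeing 'b' is not enough: while WED holds 'a', the backing
	 * 32K page keeps a nonzero refcount and cannot be released,
	 * which is how page_frag_alloc() in mt76_dma_rx_fill()
	 * eventually starts failing. */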

> Regards,
> Lorenzo
> 
> >                 t->dma_addr = 0;
> > -               t->ptr = NULL;
> > 
> >                 mt76_put_rxwi(dev, t);
> > 
> > @@ -569,6 +571,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> > mt76_queue *q)
> >         while (q->queued < q->ndesc - 1) {
> >                 struct mt76_txwi_cache *t = NULL;
> >                 struct mt76_queue_buf qbuf;
> > +               bool skip_alloc = false;
> >                 void *buf = NULL;
> > 
> >                 if ((q->flags & MT_QFLAG_WED) &&
> > @@ -576,11 +579,19 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> > mt76_queue *q)
> >                         t = mt76_get_rxwi(dev);
> >                         if (!t)
> >                                 break;
> > +
> > +                       if (t->ptr) {
> > +                               skip_alloc = true;
> > +                               buf = t->ptr;
> > +                       }
> >                 }
> > 
> > -               buf = page_frag_alloc(&q->rx_page, q->buf_size,
> > GFP_ATOMIC);
> > -               if (!buf)
> > -                       break;
> > +               if (!skip_alloc) {
> > +                       buf = page_frag_alloc(&q->rx_page, q-
> > >buf_size,
> > +                                             GFP_ATOMIC);
> > +                       if (!buf)
> > +                               break;
> > +               }
> > 
> >                 addr = dma_map_single(dev->dma_dev, buf, len,
> > DMA_FROM_DEVICE);
> >                 if (unlikely(dma_mapping_error(dev->dma_dev,
> > addr))) {
> > --
> > 2.18.0
> > 
> > > > -- 
> > > > 2.18.0
> > > >
Felix Fietkau Dec. 21, 2022, 3:12 p.m. UTC | #5
Hi Sujuan,

> Yes, it is expensive, but without the memcpy it will cause memory
> fragmentation (we hit this issue in internal SQC).
> 
> As we know, WED needs to exchange rx packets (belonging to the WED rx
> buffer manager) with the wifi driver (DMA rx data queue) by exchanging
> WFDMA descriptors (dmad), to keep free rx buffers available to WED.
> 
> It is possible that a large number of buffers get exchanged to WED and
> cannot come back to the WLAN driver. So the memory from the same 32K
> page cache cannot be released, and page_frag_alloc() in
> mt76_dma_rx_fill() will fail.
> 
> Any ideas other than a memcpy?
A simple solution would be to simply allocate single pages, or half-page fragments.

- Felix
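
A minimal sketch of the half-page variant (hypothetical helper and
q->rx_half_page field, assuming 4K pages and 2K rx buffers):

	static void *
	mt76_alloc_half_page_buf(struct mt76_queue *q)
	{
		struct page *page = q->rx_half_page;

		if (page) {
			/* hand out the second half of the cached page */
			q->rx_half_page = NULL;
			return page_address(page) + PAGE_SIZE / 2;
		}

		page = dev_alloc_page();
		if (!page)
			return NULL;

		/* refcount is 1 after allocation; take one extra ref so
		 * that each half holds a reference and the page is only
		 * released after both consumers call skb_free_frag() */
		page_ref_inc(page);
		q->rx_half_page = page;
		return page_address(page);
	}

A stuck buffer would then pin at most one 4K page instead of a whole
32K frag-cache chunk.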
Lorenzo Bianconi Dec. 21, 2022, 6:02 p.m. UTC | #6
On Dec 21, Felix Fietkau wrote:
> Hi Sujuan,
> 
> > Yes, it is expensive, but without the memcpy it will cause memory
> > fragmentation (we hit this issue in internal SQC).
> > 
> > As we know, WED needs to exchange rx packets (belonging to the WED rx
> > buffer manager) with the wifi driver (DMA rx data queue) by exchanging
> > WFDMA descriptors (dmad), to keep free rx buffers available to WED.
> > 
> > It is possible that a large number of buffers get exchanged to WED and
> > cannot come back to the WLAN driver. So the memory from the same 32K
> > page cache cannot be released, and page_frag_alloc() in
> > mt76_dma_rx_fill() will fail.
> > 
> > Any ideas other than a memcpy?
> A simple solution would be to simply allocate single pages, or half-page fragments.
> 
> - Felix
> 

A simple approach would be allocating a single page for each rx buffer.

@Sujuan: can you please double check if it is ok from performance and memory
	 fragmentation point of view? If not I guess we can try to optimize it
	 and allocate multiple buffers in the same page tweaking page refcount.

(this patch must be applied on top of Felix's dma fix).

Regards,
Lorenzo

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index 28a7fe064313..1d9e580977fc 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -580,6 +580,20 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev, struct mt76_queue *q,
 	return ret;
 }
 
+static void *
+mt76_dma_get_rx_buf(struct mt76_queue *q)
+{
+	if ((q->flags & MT_QFLAG_WED) &&
+	    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
+		/* WED RX queue */
+		struct page *page = dev_alloc_page();
+
+		return page ? page_address(page) : NULL;
+	}
+
+	return page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
+}
+
 static int
 mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 {
@@ -596,7 +610,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 		struct mt76_queue_buf qbuf;
 		void *buf = NULL;
 
-		buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
+		buf = mt76_dma_get_rx_buf(q);
 		if (!buf)
 			break;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
index 1a2e4df8d1b5..2924e71e4fbe 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
@@ -594,13 +594,9 @@ static void mt7915_mmio_wed_offload_disable(struct mtk_wed_device *wed)
 static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device *wed)
 {
 	struct mt7915_dev *dev;
-	u32 length;
 	int i;
 
 	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
-	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
-				sizeof(struct skb_shared_info));
-
 	for (i = 0; i < dev->mt76.rx_token_size; i++) {
 		struct mt76_txwi_cache *t;
 
@@ -610,7 +606,7 @@ static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device *wed)
 
 		dma_unmap_single(dev->mt76.dma_dev, t->dma_addr,
 				 wed->wlan.rx_size, DMA_FROM_DEVICE);
-		__free_pages(virt_to_page(t->ptr), get_order(length));
+		__free_page(virt_to_page(t->ptr));
 		t->ptr = NULL;
 
 		mt76_put_rxwi(&dev->mt76, t);
@@ -621,13 +617,9 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 {
 	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
 	struct mt7915_dev *dev;
-	u32 length;
 	int i;
 
 	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
-	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
-				sizeof(struct skb_shared_info));
-
 	for (i = 0; i < size; i++) {
 		struct mt76_txwi_cache *t = mt76_get_rxwi(&dev->mt76);
 		dma_addr_t phy_addr;
@@ -635,7 +627,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 		int token;
 		void *ptr;
 
-		page = __dev_alloc_pages(GFP_KERNEL, get_order(length));
+		page = __dev_alloc_page(GFP_KERNEL);
 		if (!page)
 			goto unmap;
 
@@ -644,7 +636,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 					  wed->wlan.rx_size,
 					  DMA_TO_DEVICE);
 		if (unlikely(dma_mapping_error(dev->mt76.dev, phy_addr))) {
-			__free_pages(page, get_order(length));
+			__free_page(page);
 			goto unmap;
 		}
 
@@ -653,7 +645,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 		if (token < 0) {
 			dma_unmap_single(dev->mt76.dma_dev, phy_addr,
 					 wed->wlan.rx_size, DMA_TO_DEVICE);
-			__free_pages(page, get_order(length));
+			__free_page(page);
 			goto unmap;
 		}
Sujuan Chen Dec. 22, 2022, 8:51 a.m. UTC | #7
On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
> On Dec 21, Felix Fietkau wrote:
> > Hi Sujuan,
> > 
> > > Yes, it is expensive, but without the memcpy it will cause memory
> > > fragmentation (we hit this issue in internal SQC).
> > > 
> > > As we know, WED needs to exchange rx packets (belonging to the WED rx
> > > buffer manager) with the wifi driver (DMA rx data queue) by exchanging
> > > WFDMA descriptors (dmad), to keep free rx buffers available to WED.
> > > 
> > > It is possible that a large number of buffers get exchanged to WED and
> > > cannot come back to the WLAN driver. So the memory from the same 32K
> > > page cache cannot be released, and page_frag_alloc() in
> > > mt76_dma_rx_fill() will fail.
> > > 
> > > Any ideas other than a memcpy?
> > 
> > A simple solution would be to simply allocate single pages, or
> > half-page fragments.
> > 
> > - Felix
> > 
> 
> A simple approach would be allocating a single page for each rx
> buffer.
> 
> @Sujuan: can you please double check if it is ok from performance and
> memory
> 	 fragmentation point of view? If not I guess we can try to
> optimize it
> 	 and allocate multiple buffers in the same page tweaking page
> refcount.
> 
> (this patch must be applied on top of Felix's dma fix).
> 

Allocating a single page for each rx buffer avoids memory fragmentation,
but it always uses 4K for an rx packet that only needs 2K, right?

I guess performance would be worse without the page cache.
We have tested this on the MTK private driver: a 7% drop in throughput
when setting a 4K page cache compared to the 32K page cache, and a 10%
drop when using slab to allocate the buffers.

A single page per rx buffer may cause a throughput drop of over 7% and
waste memory; what do you think?

Regards,
Sujuan

> Regards,
> Lorenzo
> 
> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
> b/drivers/net/wireless/mediatek/mt76/dma.c
> index 28a7fe064313..1d9e580977fc 100644
> --- a/drivers/net/wireless/mediatek/mt76/dma.c
> +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> @@ -580,6 +580,20 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev,
> struct mt76_queue *q,
>  	return ret;
>  }
>  
> +static void *
> +mt76_dma_get_rx_buf(struct mt76_queue *q)
> +{
> +	if ((q->flags & MT_QFLAG_WED) &&
> +	    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
> +		/* WED RX queue */
> +		struct page *page = dev_alloc_page();
> +
> +		return page ? page_address(page) : NULL;
> +	}
> +
> +	return page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
> +}
> +
>  static int
>  mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
>  {
> @@ -596,7 +610,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> mt76_queue *q)
>  		struct mt76_queue_buf qbuf;
>  		void *buf = NULL;
>  
> -		buf = page_frag_alloc(&q->rx_page, q->buf_size,
> GFP_ATOMIC);
> +		buf = mt76_dma_get_rx_buf(q);
>  		if (!buf)
>  			break;
>  
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> index 1a2e4df8d1b5..2924e71e4fbe 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> @@ -594,13 +594,9 @@ static void
> mt7915_mmio_wed_offload_disable(struct mtk_wed_device *wed)
>  static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device
> *wed)
>  {
>  	struct mt7915_dev *dev;
> -	u32 length;
>  	int i;
>  
>  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> -				sizeof(struct skb_shared_info));
> -
>  	for (i = 0; i < dev->mt76.rx_token_size; i++) {
>  		struct mt76_txwi_cache *t;
>  
> @@ -610,7 +606,7 @@ static void mt7915_mmio_wed_release_rx_buf(struct
> mtk_wed_device *wed)
>  
>  		dma_unmap_single(dev->mt76.dma_dev, t->dma_addr,
>  				 wed->wlan.rx_size, DMA_FROM_DEVICE);
> -		__free_pages(virt_to_page(t->ptr), get_order(length));
> +		__free_page(virt_to_page(t->ptr));
>  		t->ptr = NULL;
>  
>  		mt76_put_rxwi(&dev->mt76, t);
> @@ -621,13 +617,9 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> mtk_wed_device *wed, int size)
>  {
>  	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
>  	struct mt7915_dev *dev;
> -	u32 length;
>  	int i;
>  
>  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> -				sizeof(struct skb_shared_info));
> -
>  	for (i = 0; i < size; i++) {
>  		struct mt76_txwi_cache *t = mt76_get_rxwi(&dev->mt76);
>  		dma_addr_t phy_addr;
> @@ -635,7 +627,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> mtk_wed_device *wed, int size)
>  		int token;
>  		void *ptr;
>  
> -		page = __dev_alloc_pages(GFP_KERNEL,
> get_order(length));
> +		page = __dev_alloc_page(GFP_KERNEL);
>  		if (!page)
>  			goto unmap;
>  
> @@ -644,7 +636,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> mtk_wed_device *wed, int size)
>  					  wed->wlan.rx_size,
>  					  DMA_TO_DEVICE);
>  		if (unlikely(dma_mapping_error(dev->mt76.dev,
> phy_addr))) {
> -			__free_pages(page, get_order(length));
> +			__free_page(page);
>  			goto unmap;
>  		}
>  
> @@ -653,7 +645,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> mtk_wed_device *wed, int size)
>  		if (token < 0) {
>  			dma_unmap_single(dev->mt76.dma_dev, phy_addr,
>  					 wed->wlan.rx_size,
> DMA_TO_DEVICE);
> -			__free_pages(page, get_order(length));
> +			__free_page(page);
>  			goto unmap;
>  		}
>
Lorenzo Bianconi Dec. 22, 2022, 9:44 a.m. UTC | #8
> On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
> > On Dec 21, Felix Fietkau wrote:
> > > Hi Sujuan,
> > > 
> > > > Yes, it is expensive, but without the memcpy it will cause memory
> > > > fragmentation (we hit this issue in internal SQC).
> > > > 
> > > > As we know, WED needs to exchange rx packets (belonging to the WED rx
> > > > buffer manager) with the wifi driver (DMA rx data queue) by exchanging
> > > > WFDMA descriptors (dmad), to keep free rx buffers available to WED.
> > > > 
> > > > It is possible that a large number of buffers get exchanged to WED and
> > > > cannot come back to the WLAN driver. So the memory from the same 32K
> > > > page cache cannot be released, and page_frag_alloc() in
> > > > mt76_dma_rx_fill() will fail.
> > > > 
> > > > Any ideas other than a memcpy?
> > > 
> > > A simple solution would be to simply allocate single pages, or
> > > half-page fragments.
> > > 
> > > - Felix
> > > 
> > 
> > A simple approach would be allocating a single page for each rx
> > buffer.
> > 
> > @Sujuan: can you please double check if it is ok from performance and
> > memory
> > 	 fragmentation point of view? If not I guess we can try to
> > optimize it
> > 	 and allocate multiple buffers in the same page tweaking page
> > refcount.
> > 
> > (this patch must be applied on top of Felix's dma fix).
> > 
> 
> Allocating a single page for each rx buffer avoids memory fragmentation,
> but it always uses 4K for an rx packet that only needs 2K, right?

Correct, we can optimize it by allocating multiple buffers (in this case
2, assuming 4K pages) in a single page and recycling the page.

> 
> I guess performance would be worse without the page cache.

I think it is a trade-off

> We have tested this on the MTK private driver: a 7% drop in throughput
> when setting a 4K page cache compared to the 32K page cache, and a 10%
> drop when using slab to allocate the buffers.

IIUC you tested with PAGE_FRAG_CACHE_MAX_SIZE set to 4K (or with a private
page_frag_alloc() implementation): you avoided memory allocation failures
due to fragmentation, but you got a ~7% drop in throughput, correct?
I think this is quite expected, since we need to refill the page cache
~8 times more often.

Not considering memory fragmentation, have you measured the impact of the
memcpy of a full buffer?

> 
> A single page per rx buffer may cause a throughput drop of over 7% and
> waste memory; what do you think?

Implementing page recycling as it is done in page_frag_alloc(), we should
get the same results you got with PAGE_FRAG_CACHE_MAX_SIZE set to 4K.

Regards,
Lorenzo

> 
> Regards,
> Sujuan
> 
> > Regards,
> > Lorenzo
> > 
> > diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
> > b/drivers/net/wireless/mediatek/mt76/dma.c
> > index 28a7fe064313..1d9e580977fc 100644
> > --- a/drivers/net/wireless/mediatek/mt76/dma.c
> > +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> > @@ -580,6 +580,20 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev,
> > struct mt76_queue *q,
> >  	return ret;
> >  }
> >  
> > +static void *
> > +mt76_dma_get_rx_buf(struct mt76_queue *q)
> > +{
> > +	if ((q->flags & MT_QFLAG_WED) &&
> > +	    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
> > +		/* WED RX queue */
> > +		struct page *page = dev_alloc_page();
> > +
> > +		return page ? page_address(page) : NULL;
> > +	}
> > +
> > +	return page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
> > +}
> > +
> >  static int
> >  mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
> >  {
> > @@ -596,7 +610,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> > mt76_queue *q)
> >  		struct mt76_queue_buf qbuf;
> >  		void *buf = NULL;
> >  
> > -		buf = page_frag_alloc(&q->rx_page, q->buf_size,
> > GFP_ATOMIC);
> > +		buf = mt76_dma_get_rx_buf(q);
> >  		if (!buf)
> >  			break;
> >  
> > diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > index 1a2e4df8d1b5..2924e71e4fbe 100644
> > --- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > @@ -594,13 +594,9 @@ static void
> > mt7915_mmio_wed_offload_disable(struct mtk_wed_device *wed)
> >  static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device
> > *wed)
> >  {
> >  	struct mt7915_dev *dev;
> > -	u32 length;
> >  	int i;
> >  
> >  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> > -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> > -				sizeof(struct skb_shared_info));
> > -
> >  	for (i = 0; i < dev->mt76.rx_token_size; i++) {
> >  		struct mt76_txwi_cache *t;
> >  
> > @@ -610,7 +606,7 @@ static void mt7915_mmio_wed_release_rx_buf(struct
> > mtk_wed_device *wed)
> >  
> >  		dma_unmap_single(dev->mt76.dma_dev, t->dma_addr,
> >  				 wed->wlan.rx_size, DMA_FROM_DEVICE);
> > -		__free_pages(virt_to_page(t->ptr), get_order(length));
> > +		__free_page(virt_to_page(t->ptr));
> >  		t->ptr = NULL;
> >  
> >  		mt76_put_rxwi(&dev->mt76, t);
> > @@ -621,13 +617,9 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > mtk_wed_device *wed, int size)
> >  {
> >  	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
> >  	struct mt7915_dev *dev;
> > -	u32 length;
> >  	int i;
> >  
> >  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> > -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> > -				sizeof(struct skb_shared_info));
> > -
> >  	for (i = 0; i < size; i++) {
> >  		struct mt76_txwi_cache *t = mt76_get_rxwi(&dev->mt76);
> >  		dma_addr_t phy_addr;
> > @@ -635,7 +627,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > mtk_wed_device *wed, int size)
> >  		int token;
> >  		void *ptr;
> >  
> > -		page = __dev_alloc_pages(GFP_KERNEL,
> > get_order(length));
> > +		page = __dev_alloc_page(GFP_KERNEL);
> >  		if (!page)
> >  			goto unmap;
> >  
> > @@ -644,7 +636,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > mtk_wed_device *wed, int size)
> >  					  wed->wlan.rx_size,
> >  					  DMA_TO_DEVICE);
> >  		if (unlikely(dma_mapping_error(dev->mt76.dev,
> > phy_addr))) {
> > -			__free_pages(page, get_order(length));
> > +			__free_page(page);
> >  			goto unmap;
> >  		}
> >  
> > @@ -653,7 +645,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > mtk_wed_device *wed, int size)
> >  		if (token < 0) {
> >  			dma_unmap_single(dev->mt76.dma_dev, phy_addr,
> >  					 wed->wlan.rx_size,
> > DMA_TO_DEVICE);
> > -			__free_pages(page, get_order(length));
> > +			__free_page(page);
> >  			goto unmap;
> >  		}
> >
Sujuan Chen Dec. 22, 2022, 2:01 p.m. UTC | #9
On Thu, 2022-12-22 at 10:44 +0100, lorenzo@kernel.org wrote:
> > On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
> > > On Dec 21, Felix Fietkau wrote:
> > > > Hi Sujuan,
> > > > 
> > > > > Yes, it is expensive, but without the memcpy it will cause memory
> > > > > fragmentation (we hit this issue in internal SQC).
> > > > > 
> > > > > As we know, WED needs to exchange rx packets (belonging to the WED rx
> > > > > buffer manager) with the wifi driver (DMA rx data queue) by exchanging
> > > > > WFDMA descriptors (dmad), to keep free rx buffers available to WED.
> > > > > 
> > > > > It is possible that a large number of buffers get exchanged to WED and
> > > > > cannot come back to the WLAN driver. So the memory from the same 32K
> > > > > page cache cannot be released, and page_frag_alloc() in
> > > > > mt76_dma_rx_fill() will fail.
> > > > > 
> > > > > Any ideas other than a memcpy?
> > > > 
> > > > A simple solution would be to simply allocate single pages, or
> > > > half-page fragments.
> > > > 
> > > > - Felix
> > > > 
> > > 
> > > A simple approach would be allocating a single page for each rx
> > > buffer.
> > > 
> > > @Sujuan: can you please double check if it is ok from performance
> > > and
> > > memory
> > > 	 fragmentation point of view? If not I guess we can try to
> > > optimize it
> > > 	 and allocate multiple buffers in the same page tweaking page
> > > refcount.
> > > 
> > > (this patch must be applied on top of Felix's dma fix).
> > > 
> > 
> > Allocating a single page for each rx buffer avoids memory
> > fragmentation, but it always uses 4K for an rx packet that only
> > needs 2K, right?
> 
> Correct, we can optimize it by allocating multiple buffers (in this
> case 2, assuming 4K pages) in a single page and recycling the page.
> 
> > 
> > I guess performance would be worse without the page cache.
> 
> I think it is a trade-off
> 
> > We have tested this on the MTK private driver: a 7% drop in
> > throughput when setting a 4K page cache compared to the 32K page
> > cache, and a 10% drop when using slab to allocate the buffers.
> 
> IIUC you tested with PAGE_FRAG_CACHE_MAX_SIZE set to 4K (or with a
> private page_frag_alloc() implementation): you avoided memory
> allocation failures due to fragmentation, but you got a ~7% drop in
> throughput, correct?
> I think this is quite expected, since we need to refill the page
> cache ~8 times more often.
> 
> Not considering memory fragmentation, have you measured the impact of
> the
> memcpy of a full buffer?
> 

Well, for the pure SW path it is maybe a ~300M drop (not sure) when
using memcpy (5G/HE80/4x4).
But we do the memcpy only when WED is on; after a few unbinding packets,
all flows are offloaded by HW. It is also a trade-off.

Regards,
Sujuan

> > 
> > A single page per rx buffer may cause a throughput drop of over 7%
> > and waste memory; what do you think?
> 
> Implementing page recycling as it is done in page_frag_alloc(), we
> should get the same results you got with PAGE_FRAG_CACHE_MAX_SIZE set
> to 4K.
> 
> Regards,
> Lorenzo
> 
> > 
> > Regards,
> > Sujuan
> > 
> > > Regards,
> > > Lorenzo
> > > 
> > > diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
> > > b/drivers/net/wireless/mediatek/mt76/dma.c
> > > index 28a7fe064313..1d9e580977fc 100644
> > > --- a/drivers/net/wireless/mediatek/mt76/dma.c
> > > +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> > > @@ -580,6 +580,20 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev,
> > > struct mt76_queue *q,
> > >  	return ret;
> > >  }
> > >  
> > > +static void *
> > > +mt76_dma_get_rx_buf(struct mt76_queue *q)
> > > +{
> > > +	if ((q->flags & MT_QFLAG_WED) &&
> > > +	    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
> > > +		/* WED RX queue */
> > > +		struct page *page = dev_alloc_page();
> > > +
> > > +		return page ? page_address(page) : NULL;
> > > +	}
> > > +
> > > +	return page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
> > > +}
> > > +
> > >  static int
> > >  mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
> > >  {
> > > @@ -596,7 +610,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> > > mt76_queue *q)
> > >  		struct mt76_queue_buf qbuf;
> > >  		void *buf = NULL;
> > >  
> > > -		buf = page_frag_alloc(&q->rx_page, q->buf_size,
> > > GFP_ATOMIC);
> > > +		buf = mt76_dma_get_rx_buf(q);
> > >  		if (!buf)
> > >  			break;
> > >  
> > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > index 1a2e4df8d1b5..2924e71e4fbe 100644
> > > --- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > @@ -594,13 +594,9 @@ static void
> > > mt7915_mmio_wed_offload_disable(struct mtk_wed_device *wed)
> > >  static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device
> > > *wed)
> > >  {
> > >  	struct mt7915_dev *dev;
> > > -	u32 length;
> > >  	int i;
> > >  
> > >  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> > > -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> > > -				sizeof(struct skb_shared_info));
> > > -
> > >  	for (i = 0; i < dev->mt76.rx_token_size; i++) {
> > >  		struct mt76_txwi_cache *t;
> > >  
> > > @@ -610,7 +606,7 @@ static void
> > > mt7915_mmio_wed_release_rx_buf(struct
> > > mtk_wed_device *wed)
> > >  
> > >  		dma_unmap_single(dev->mt76.dma_dev, t->dma_addr,
> > >  				 wed->wlan.rx_size, DMA_FROM_DEVICE);
> > > -		__free_pages(virt_to_page(t->ptr), get_order(length));
> > > +		__free_page(virt_to_page(t->ptr));
> > >  		t->ptr = NULL;
> > >  
> > >  		mt76_put_rxwi(&dev->mt76, t);
> > > @@ -621,13 +617,9 @@ static u32
> > > mt7915_mmio_wed_init_rx_buf(struct
> > > mtk_wed_device *wed, int size)
> > >  {
> > >  	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
> > >  	struct mt7915_dev *dev;
> > > -	u32 length;
> > >  	int i;
> > >  
> > >  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> > > -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> > > -				sizeof(struct skb_shared_info));
> > > -
> > >  	for (i = 0; i < size; i++) {
> > >  		struct mt76_txwi_cache *t = mt76_get_rxwi(&dev->mt76);
> > >  		dma_addr_t phy_addr;
> > > @@ -635,7 +627,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > > mtk_wed_device *wed, int size)
> > >  		int token;
> > >  		void *ptr;
> > >  
> > > -		page = __dev_alloc_pages(GFP_KERNEL,
> > > get_order(length));
> > > +		page = __dev_alloc_page(GFP_KERNEL);
> > >  		if (!page)
> > >  			goto unmap;
> > >  
> > > @@ -644,7 +636,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > > mtk_wed_device *wed, int size)
> > >  					  wed->wlan.rx_size,
> > >  					  DMA_TO_DEVICE);
> > >  		if (unlikely(dma_mapping_error(dev->mt76.dev,
> > > phy_addr))) {
> > > -			__free_pages(page, get_order(length));
> > > +			__free_page(page);
> > >  			goto unmap;
> > >  		}
> > >  
> > > @@ -653,7 +645,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > > mtk_wed_device *wed, int size)
> > >  		if (token < 0) {
> > >  			dma_unmap_single(dev->mt76.dma_dev, phy_addr,
> > >  					 wed->wlan.rx_size,
> > > DMA_TO_DEVICE);
> > > -			__free_pages(page, get_order(length));
> > > +			__free_page(page);
> > >  			goto unmap;
> > >  		}
> > >
Lorenzo Bianconi Dec. 22, 2022, 2:26 p.m. UTC | #10
> On Thu, 2022-12-22 at 10:44 +0100, lorenzo@kernel.org wrote:
> > > On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
> > > > On Dec 21, Felix Fietkau wrote:
> > > > > Hi Sujuan,
> > > > > 
> > > > > > Yes, it is expensive, but without the memcpy it will cause memory
> > > > > > fragmentation (we hit this issue in internal SQC).
> > > > > > 
> > > > > > As we know, WED needs to exchange rx packets (belonging to the WED rx
> > > > > > buffer manager) with the wifi driver (DMA rx data queue) by exchanging
> > > > > > WFDMA descriptors (dmad), to keep free rx buffers available to WED.
> > > > > > 
> > > > > > It is possible that a large number of buffers get exchanged to WED and
> > > > > > cannot come back to the WLAN driver. So the memory from the same 32K
> > > > > > page cache cannot be released, and page_frag_alloc() in
> > > > > > mt76_dma_rx_fill() will fail.
> > > > > > 
> > > > > > Any ideas other than a memcpy?
> > > > > 
> > > > > A simple solution would be to simply allocate single pages, or
> > > > > half-page fragments.
> > > > > 
> > > > > - Felix
> > > > > 
> > > > 
> > > > A simple approach would be allocating a single page for each rx
> > > > buffer.
> > > > 
> > > > @Sujuan: can you please double check if it is ok from performance
> > > > and
> > > > memory
> > > > 	 fragmentation point of view? If not I guess we can try to
> > > > optimize it
> > > > 	 and allocate multiple buffers in the same page tweaking page
> > > > refcount.
> > > > 
> > > > (this patch must be applied on top of Felix's dma fix).
> > > > 
> > > 
> > > Allocating a single page for each rx buffer avoids memory
> > > fragmentation, but it always uses 4K for an rx packet that only
> > > needs 2K, right?
> > 
> > Correct, we can optimize it by allocating multiple buffers (in this
> > case 2, assuming 4K pages) in a single page and recycling the page.
> > 
> > > 
> > > I guess performance would be worse without the page cache.
> > 
> > I think it is a trade-off
> > 
> > > We have tested this on the MTK private driver: a 7% drop in
> > > throughput when setting a 4K page cache compared to the 32K page
> > > cache, and a 10% drop when using slab to allocate the buffers.
> > 
> > IIUC you tested with PAGE_FRAG_CACHE_MAX_SIZE set to 4K (or with a
> > private page_frag_alloc() implementation): you avoided memory
> > allocation failures due to fragmentation, but you got a ~7% drop in
> > throughput, correct?
> > I think this is quite expected, since we need to refill the page
> > cache ~8 times more often.
> > 
> > Not considering memory fragmentation, have you measured the impact of
> > the
> > memcpy of a full buffer?
> > 
> 
> Well, for the pure SW path it is maybe a ~300M drop (not sure) when
> using memcpy (5G/HE80/4x4).
> But we do the memcpy only when WED is on; after a few unbinding packets,
> all flows are offloaded by HW. It is also a trade-off.

I guess we have quite a big impact if we enable WED and do not actually
offload any flow (or do not enable the HW flowtable in netfilter), don't we?

Regards,
Lorenzo

> 
> Regards,
> Sujuan
> 
> > > 
> > > A single page per rx buffer may cause a throughput drop of over 7%
> > > and waste memory; what do you think?
> > 
> > Implementing page recycling as it is done in page_frag_alloc(), we
> > should get the same results you got with PAGE_FRAG_CACHE_MAX_SIZE set
> > to 4K.
> > 
> > Regards,
> > Lorenzo
> > 
> > > 
> > > Regards,
> > > Sujuan
> > > 
> > > > Regards,
> > > > Lorenzo
> > > > 
> > > > diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
> > > > b/drivers/net/wireless/mediatek/mt76/dma.c
> > > > index 28a7fe064313..1d9e580977fc 100644
> > > > --- a/drivers/net/wireless/mediatek/mt76/dma.c
> > > > +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> > > > @@ -580,6 +580,20 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev,
> > > > struct mt76_queue *q,
> > > >  	return ret;
> > > >  }
> > > >  
> > > > +static void *
> > > > +mt76_dma_get_rx_buf(struct mt76_queue *q)
> > > > +{
> > > > +	if ((q->flags & MT_QFLAG_WED) &&
> > > > +	    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
> > > > +		/* WED RX queue */
> > > > +		struct page *page = dev_alloc_page();
> > > > +
> > > > +		return page ? page_address(page) : NULL;
> > > > +	}
> > > > +
> > > > +	return page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
> > > > +}
> > > > +
> > > >  static int
> > > >  mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
> > > >  {
> > > > @@ -596,7 +610,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> > > > mt76_queue *q)
> > > >  		struct mt76_queue_buf qbuf;
> > > >  		void *buf = NULL;
> > > >  
> > > > -		buf = page_frag_alloc(&q->rx_page, q->buf_size,
> > > > GFP_ATOMIC);
> > > > +		buf = mt76_dma_get_rx_buf(q);
> > > >  		if (!buf)
> > > >  			break;
> > > >  
> > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > > b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > > index 1a2e4df8d1b5..2924e71e4fbe 100644
> > > > --- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > > +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > > @@ -594,13 +594,9 @@ static void
> > > > mt7915_mmio_wed_offload_disable(struct mtk_wed_device *wed)
> > > >  static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device
> > > > *wed)
> > > >  {
> > > >  	struct mt7915_dev *dev;
> > > > -	u32 length;
> > > >  	int i;
> > > >  
> > > >  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> > > > -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> > > > -				sizeof(struct skb_shared_info));
> > > > -
> > > >  	for (i = 0; i < dev->mt76.rx_token_size; i++) {
> > > >  		struct mt76_txwi_cache *t;
> > > >  
> > > > @@ -610,7 +606,7 @@ static void
> > > > mt7915_mmio_wed_release_rx_buf(struct
> > > > mtk_wed_device *wed)
> > > >  
> > > >  		dma_unmap_single(dev->mt76.dma_dev, t->dma_addr,
> > > >  				 wed->wlan.rx_size, DMA_FROM_DEVICE);
> > > > -		__free_pages(virt_to_page(t->ptr), get_order(length));
> > > > +		__free_page(virt_to_page(t->ptr));
> > > >  		t->ptr = NULL;
> > > >  
> > > >  		mt76_put_rxwi(&dev->mt76, t);
> > > > @@ -621,13 +617,9 @@ static u32
> > > > mt7915_mmio_wed_init_rx_buf(struct
> > > > mtk_wed_device *wed, int size)
> > > >  {
> > > >  	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
> > > >  	struct mt7915_dev *dev;
> > > > -	u32 length;
> > > >  	int i;
> > > >  
> > > >  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> > > > -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> > > > -				sizeof(struct skb_shared_info));
> > > > -
> > > >  	for (i = 0; i < size; i++) {
> > > >  		struct mt76_txwi_cache *t = mt76_get_rxwi(&dev->mt76);
> > > >  		dma_addr_t phy_addr;
> > > > @@ -635,7 +627,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > > > mtk_wed_device *wed, int size)
> > > >  		int token;
> > > >  		void *ptr;
> > > >  
> > > > -		page = __dev_alloc_pages(GFP_KERNEL,
> > > > get_order(length));
> > > > +		page = __dev_alloc_page(GFP_KERNEL);
> > > >  		if (!page)
> > > >  			goto unmap;
> > > >  
> > > > @@ -644,7 +636,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > > > mtk_wed_device *wed, int size)
> > > >  					  wed->wlan.rx_size,
> > > >  					  DMA_TO_DEVICE);
> > > >  		if (unlikely(dma_mapping_error(dev->mt76.dev,
> > > > phy_addr))) {
> > > > -			__free_pages(page, get_order(length));
> > > > +			__free_page(page);
> > > >  			goto unmap;
> > > >  		}
> > > >  
> > > > @@ -653,7 +645,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > > > mtk_wed_device *wed, int size)
> > > >  		if (token < 0) {
> > > >  			dma_unmap_single(dev->mt76.dma_dev, phy_addr,
> > > >  					 wed->wlan.rx_size,
> > > > DMA_TO_DEVICE);
> > > > -			__free_pages(page, get_order(length));
> > > > +			__free_page(page);
> > > >  			goto unmap;
> > > >  		}
> > > >
Lorenzo Bianconi Dec. 22, 2022, 11:57 p.m. UTC | #11
> > On Thu, 2022-12-22 at 10:44 +0100, lorenzo@kernel.org wrote:
> > > > On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
> > > > > On Dec 21, Felix Fietkau wrote:
> > > > > > Hi Sujuan,
> > > > > > 
> > > > > > > Yes, it is expensive, but without the memcpy it will cause memory
> > > > > > > fragmentation (we hit this issue in internal SQC).
> > > > > > > 
> > > > > > > As we know, WED needs to exchange rx packets (belonging to the WED rx
> > > > > > > buffer manager) with the wifi driver (DMA rx data queue) by exchanging
> > > > > > > WFDMA descriptors (dmad), to keep free rx buffers available to WED.
> > > > > > > 
> > > > > > > It is possible that a large number of buffers get exchanged to WED and
> > > > > > > cannot come back to the WLAN driver. So the memory from the same 32K
> > > > > > > page cache cannot be released, and page_frag_alloc() in
> > > > > > > mt76_dma_rx_fill() will fail.
> > > > > > > 
> > > > > > > Any ideas other than a memcpy?
> > > > > > 
> > > > > > A simple solution would be to simply allocate single pages, or
> > > > > > half-page fragments.
> > > > > > 
> > > > > > - Felix
> > > > > > 
> > > > > 
> > > > > A simple approach would be allocating a single page for each rx
> > > > > buffer.
> > > > > 
> > > > > @Sujuan: can you please double check if it is ok from performance
> > > > > and
> > > > > memory
> > > > > 	 fragmentation point of view? If not I guess we can try to
> > > > > optimize it
> > > > > 	 and allocate multiple buffers in the same page tweaking page
> > > > > refcount.
> > > > > 
> > > > > (this patch must be applied on top of Felix's dma fix).
> > > > > 
> > > > 
> > > > Allocating a single page for each rx buffer avoids memory
> > > > fragmentation, but it always uses 4K for an rx packet that only
> > > > needs 2K, right?
> > > 
> > > Correct, we can optimize it by allocating multiple buffers (in this
> > > case 2, assuming 4K pages) in a single page and recycling the page.
> > > 
> > > > 
> > > > I guess performance would be worse without the page cache.
> > > 
> > > I think it is a trade-off
> > > 
> > > > We have tested this on the MTK private driver: a 7% drop in
> > > > throughput when setting a 4K page cache compared to the 32K page
> > > > cache, and a 10% drop when using slab to allocate the buffers.
> > > 
> > > IIUC you tested with PAGE_FRAG_CACHE_MAX_SIZE set to 4K (or with a
> > > private page_frag_alloc() implementation): you avoided memory
> > > allocation failures due to fragmentation, but you got a ~7% drop in
> > > throughput, correct?
> > > I think this is quite expected, since we need to refill the page
> > > cache ~8 times more often.
> > > 
> > > Not considering memory fragmentation, have you measured the impact of
> > > the
> > > memcpy of a full buffer?
> > > 
> > 
> > Well, for the pure SW path it is maybe a ~300M drop (not sure) when
> > using memcpy (5G/HE80/4x4).
> > But we do the memcpy only when WED is on; after a few unbinding packets,
> > all flows are offloaded by HW. It is also a trade-off.
> 
> I guess we have quite a big impact if we enable WED and do not actually
> offload any flow (or do not enable the HW flowtable in netfilter), don't we?

Hi Sujuan,

Here ([0],[1]) I switched mt76 to the page_pool allocator [2]. page_pool
allows us to allocate pages of a requested order (order 0 for mt76) and
split them between multiple buffers (2 buffers per page for mt7915).
Moreover, the pp allocator recycles the pages to avoid the page
allocator overhead, and keeps the pages DMA-mapped if requested (I
implemented the support for mt7915).
The code is stable (with and without WED) but I have not run any
performance tests. Can you please help with them, to see if this code
helps solve the discussed issue?

Regards,
Lorenzo

[0] https://github.com/LorenzoBianconi/wireless-drivers-next/commit/61ce8e9be04b40fa8a1c0f2778d266982be32d7a
[1] https://github.com/LorenzoBianconi/wireless-drivers-next/commit/f54eefce06d389b155574092d246b21631e42d47
[2] https://docs.kernel.org/networking/page_pool.html
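
For reference, a minimal sketch of the page_pool setup along these
lines (the parameter values and the per-queue hookup are assumptions
for illustration, not taken from the commits above):

	/* one pool per rx queue: order-0 pages, carved into fragments,
	 * recycled and kept DMA-mapped by the pool itself */
	struct page_pool_params pp_params = {
		.order		= 0,
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG,
		.nid		= NUMA_NO_NODE,
		.dev		= dev->dma_dev,
		.dma_dir	= DMA_FROM_DEVICE,
		.pool_size	= 256,	/* assumed ring size */
	};

	q->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(q->page_pool))
		return PTR_ERR(q->page_pool);

	/* rx refill: two ~2K buffers fit in each (possibly recycled) page */
	struct page *page;
	unsigned int offset;
	void *buf = NULL;

	page = page_pool_dev_alloc_frag(q->page_pool, &offset, q->buf_size);
	if (page)
		buf = page_address(page) + offset;

Buffers return to the pool through page_pool_put_full_page() (or skb
recycling via skb_mark_for_recycle()), so a buffer held by WED pins a
single page rather than a whole frag-cache chunk.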

> 
> Regards,
> Lorenzo
> 
> > 
> > Regards,
> > Sujuan
> > 
> > > > 
> > > > A single page per rx buffer may cause a throughput drop of over 7%
> > > > and waste memory; what do you think?
> > > 
> > > Implementing page recycling as it is done in page_frag_alloc(), we
> > > should get the same results you got with PAGE_FRAG_CACHE_MAX_SIZE set
> > > to 4K.
> > > 
> > > Regards,
> > > Lorenzo
> > > 
> > > > 
> > > > Regards,
> > > > Sujuan
> > > > 
> > > > > Regards,
> > > > > Lorenzo
> > > > > 
> > > > > diff --git a/drivers/net/wireless/mediatek/mt76/dma.c
> > > > > b/drivers/net/wireless/mediatek/mt76/dma.c
> > > > > index 28a7fe064313..1d9e580977fc 100644
> > > > > --- a/drivers/net/wireless/mediatek/mt76/dma.c
> > > > > +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> > > > > @@ -580,6 +580,20 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev,
> > > > > struct mt76_queue *q,
> > > > >  	return ret;
> > > > >  }
> > > > >  
> > > > > +static void *
> > > > > +mt76_dma_get_rx_buf(struct mt76_queue *q)
> > > > > +{
> > > > > +	if ((q->flags & MT_QFLAG_WED) &&
> > > > > +	    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
> > > > > +		/* WED RX queue */
> > > > > +		struct page *page = dev_alloc_page();
> > > > > +
> > > > > +		return page ? page_address(page) : NULL;
> > > > > +	}
> > > > > +
> > > > > +	return page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
> > > > > +}
> > > > > +
> > > > >  static int
> > > > >  mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
> > > > >  {
> > > > > @@ -596,7 +610,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct
> > > > > mt76_queue *q)
> > > > >  		struct mt76_queue_buf qbuf;
> > > > >  		void *buf = NULL;
> > > > >  
> > > > > -		buf = page_frag_alloc(&q->rx_page, q->buf_size,
> > > > > GFP_ATOMIC);
> > > > > +		buf = mt76_dma_get_rx_buf(q);
> > > > >  		if (!buf)
> > > > >  			break;
> > > > >  
> > > > > diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > > > b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > > > index 1a2e4df8d1b5..2924e71e4fbe 100644
> > > > > --- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > > > +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> > > > > @@ -594,13 +594,9 @@ static void
> > > > > mt7915_mmio_wed_offload_disable(struct mtk_wed_device *wed)
> > > > >  static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device
> > > > > *wed)
> > > > >  {
> > > > >  	struct mt7915_dev *dev;
> > > > > -	u32 length;
> > > > >  	int i;
> > > > >  
> > > > >  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> > > > > -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> > > > > -				sizeof(struct skb_shared_info));
> > > > > -
> > > > >  	for (i = 0; i < dev->mt76.rx_token_size; i++) {
> > > > >  		struct mt76_txwi_cache *t;
> > > > >  
> > > > > @@ -610,7 +606,7 @@ static void
> > > > > mt7915_mmio_wed_release_rx_buf(struct
> > > > > mtk_wed_device *wed)
> > > > >  
> > > > >  		dma_unmap_single(dev->mt76.dma_dev, t->dma_addr,
> > > > >  				 wed->wlan.rx_size, DMA_FROM_DEVICE);
> > > > > -		__free_pages(virt_to_page(t->ptr), get_order(length));
> > > > > +		__free_page(virt_to_page(t->ptr));
> > > > >  		t->ptr = NULL;
> > > > >  
> > > > >  		mt76_put_rxwi(&dev->mt76, t);
> > > > > @@ -621,13 +617,9 @@ static u32
> > > > > mt7915_mmio_wed_init_rx_buf(struct
> > > > > mtk_wed_device *wed, int size)
> > > > >  {
> > > > >  	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
> > > > >  	struct mt7915_dev *dev;
> > > > > -	u32 length;
> > > > >  	int i;
> > > > >  
> > > > >  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> > > > > -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> > > > > -				sizeof(struct skb_shared_info));
> > > > > -
> > > > >  	for (i = 0; i < size; i++) {
> > > > >  		struct mt76_txwi_cache *t = mt76_get_rxwi(&dev->mt76);
> > > > >  		dma_addr_t phy_addr;
> > > > > @@ -635,7 +627,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > > > > mtk_wed_device *wed, int size)
> > > > >  		int token;
> > > > >  		void *ptr;
> > > > >  
> > > > > -		page = __dev_alloc_pages(GFP_KERNEL,
> > > > > get_order(length));
> > > > > +		page = __dev_alloc_page(GFP_KERNEL);
> > > > >  		if (!page)
> > > > >  			goto unmap;
> > > > >  
> > > > > @@ -644,7 +636,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > > > > mtk_wed_device *wed, int size)
> > > > >  					  wed->wlan.rx_size,
> > > > >  					  DMA_TO_DEVICE);
> > > > >  		if (unlikely(dma_mapping_error(dev->mt76.dev,
> > > > > phy_addr))) {
> > > > > -			__free_pages(page, get_order(length));
> > > > > +			free_page(page);
> > > > >  			goto unmap;
> > > > >  		}
> > > > >  
> > > > > @@ -653,7 +645,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct
> > > > > mtk_wed_device *wed, int size)
> > > > >  		if (token < 0) {
> > > > >  			dma_unmap_single(dev->mt76.dma_dev, phy_addr,
> > > > >  					 wed->wlan.rx_size,
> > > > > DMA_TO_DEVICE);
> > > > > -			__free_pages(page, get_order(length));
> > > > > +			free_page(page);
> > > > >  			goto unmap;
> > > > >  		}
> > > > >
Sujuan Chen Jan. 3, 2023, 1:55 a.m. UTC | #12
On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
> On Dec 21, Felix Fietkau wrote:
> > Hi Sujuan,
> > 
> > > Yes, it is expensive, but without the memcpy it will cause memory
> > > fragmentation (we hit this issue in internal SQC).
> > > 
> > > As we know, WED needs to exchange rx packets (belonging to the WED rx
> > > buffer manager) with the wifi driver (DMA rx data queue) by exchanging
> > > WFDMA DMADs (DMA descriptors) to keep free WED rx buffers available.
> > > 
> > > It is possible that a large number of buffers have been exchanged to WED
> > > and cannot come back to the wlan driver, so the memory from the same 32K
> > > page cache cannot be released and page_frag_alloc() in
> > > mt76_dma_rx_fill() will fail.
> > > 
> > > Any ideas other than memcpy?
> > 
> > A simple solution would be to allocate single pages, or half-page
> > fragments.
> > 
> > - Felix
> > 
> 
> A simple approach would be allocating a single page for each rx buffer.
> 
> @Sujuan: can you please double-check if it is ok from a performance and
> memory fragmentation point of view? If not, I guess we can try to optimize
> it and allocate multiple buffers in the same page, tweaking the page
> refcount.
> 
> (this patch must be applied on top of Felix's dma fix).
> 

Allocating a single page for each rx buffer avoids memory fragmentation,
but it always uses 4K for an rx packet that only needs 2K, right?

I guess performance would be worse without the page cache.
We tested this on the MTK private driver: a 7% drop in throughput with a
4K page cache compared to the 32K page cache, and a 10% drop when using
slab to allocate the buffers.

A single page per rx buffer may cause a throughput drop of over 7% and
waste memory; what do you think?

Regards,
Sujuan

> Regards,
> Lorenzo
> 
> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
> index 28a7fe064313..1d9e580977fc 100644
> --- a/drivers/net/wireless/mediatek/mt76/dma.c
> +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> @@ -580,6 +580,20 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev, struct mt76_queue *q,
>  	return ret;
>  }
>  
> +static void *
> +mt76_dma_get_rx_buf(struct mt76_queue *q)
> +{
> +	if ((q->flags & MT_QFLAG_WED) &&
> +	    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
> +		/* WED RX queue */
> +		struct page *page = dev_alloc_page();
> +
> +		return page ? page_address(page) : NULL;
> +	}
> +
> +	return page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
> +}
> +
>  static int
>  mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
>  {
> @@ -596,7 +610,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
>  		struct mt76_queue_buf qbuf;
>  		void *buf = NULL;
>  
> -		buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
> +		buf = mt76_dma_get_rx_buf(q);
>  		if (!buf)
>  			break;
>  
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> index 1a2e4df8d1b5..2924e71e4fbe 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
> @@ -594,13 +594,9 @@ static void mt7915_mmio_wed_offload_disable(struct mtk_wed_device *wed)
>  static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device *wed)
>  {
>  	struct mt7915_dev *dev;
> -	u32 length;
>  	int i;
>  
>  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> -				sizeof(struct skb_shared_info));
> -
>  	for (i = 0; i < dev->mt76.rx_token_size; i++) {
>  		struct mt76_txwi_cache *t;
>  
> @@ -610,7 +606,7 @@ static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device *wed)
>  
>  		dma_unmap_single(dev->mt76.dma_dev, t->dma_addr,
>  				 wed->wlan.rx_size, DMA_FROM_DEVICE);
> -		__free_pages(virt_to_page(t->ptr), get_order(length));
> +		free_page(virt_to_page(t->ptr));
>  		t->ptr = NULL;
>  
>  		mt76_put_rxwi(&dev->mt76, t);
> @@ -621,13 +617,9 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
>  {
>  	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
>  	struct mt7915_dev *dev;
> -	u32 length;
>  	int i;
>  
>  	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
> -	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
> -				sizeof(struct skb_shared_info));
> -
>  	for (i = 0; i < size; i++) {
>  		struct mt76_txwi_cache *t = mt76_get_rxwi(&dev->mt76);
>  		dma_addr_t phy_addr;
> @@ -635,7 +627,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
>  		int token;
>  		void *ptr;
>  
> -		page = __dev_alloc_pages(GFP_KERNEL, get_order(length));
> +		page = __dev_alloc_page(GFP_KERNEL);
>  		if (!page)
>  			goto unmap;
>  
> @@ -644,7 +636,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
>  					  wed->wlan.rx_size,
>  					  DMA_TO_DEVICE);
>  		if (unlikely(dma_mapping_error(dev->mt76.dev, phy_addr))) {
> -			__free_pages(page, get_order(length));
> +			free_page(page);
>  			goto unmap;
>  		}
>  
> @@ -653,7 +645,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
>  		if (token < 0) {
>  			dma_unmap_single(dev->mt76.dma_dev, phy_addr,
>  					 wed->wlan.rx_size, DMA_TO_DEVICE);
> -			__free_pages(page, get_order(length));
> +			free_page(page);
>  			goto unmap;
>  		}
>
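
A minimal userspace sketch of the pinning behaviour Sujuan describes above
(all names invented for illustration, not kernel code; the real page_frag
cache moves on to a fresh page once the current one is exhausted, but each
old page stays allocated until every one of its fragments returns, which is
exactly the fragmentation issue). One API aside on the quoted proposal:
free_page() takes an unsigned long address while virt_to_page() returns a
struct page *, so a final version would presumably use
__free_page(virt_to_page(t->ptr)) or free_page((unsigned long)t->ptr).

/*
 * Toy model of a page_frag-style cache: a single backing region is
 * handed out in fragments and can only be released once *every*
 * fragment has been returned. One fragment held indefinitely (e.g.
 * "exchanged to WED") pins the whole 32K region.
 */
#include <stdio.h>
#include <stdlib.h>

#define CACHE_SIZE (32 * 1024)	/* models the 32K page cache */

struct frag_cache {
	char *page;		/* backing region */
	size_t offset;		/* bump allocator position */
	int refcount;		/* one reference per live fragment */
};

static void *frag_alloc(struct frag_cache *c, size_t size)
{
	if (!c->page) {
		c->page = malloc(CACHE_SIZE);
		if (!c->page)
			return NULL;
		c->offset = 0;
		c->refcount = 0;
	}
	if (c->offset + size > CACHE_SIZE)
		return NULL;	/* region exhausted but still pinned */
	c->refcount++;
	c->offset += size;
	return c->page + c->offset - size;
}

static void frag_free(struct frag_cache *c)
{
	if (--c->refcount == 0) {	/* last fragment back: release */
		free(c->page);
		c->page = NULL;
	}
}

int main(void)
{
	struct frag_cache c = { 0 };
	void *held = NULL;

	for (int i = 0; i < 16; i++) {	/* 16 x 2K fills the 32K region */
		void *buf = frag_alloc(&c, 2048);
		if (i == 0)
			held = buf;	/* handed to hw, never returned */
		else
			frag_free(&c);	/* normally consumed and freed */
	}
	/* 15 of 16 fragments were freed, yet the region is still pinned
	 * and the next allocation fails (prints a NULL pointer) */
	printf("held=%p refcount=%d next alloc=%p\n",
	       held, c.refcount, frag_alloc(&c, 2048));
	return 0;
}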
Sujuan Chen Jan. 6, 2023, 4:04 a.m. UTC | #13
On Fri, 2022-12-23 at 00:57 +0100, lorenzo@kernel.org wrote:
> > > On Thu, 2022-12-22 at 10:44 +0100, lorenzo@kernel.org wrote:
> > > > > On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
> > > > > > On Dec 21, Felix Fietkau wrote:
> > > > > > > Hi Sujuan,
> > > > > > > 
> > > > > > > > > Yes, it is expensive, but without the memcpy it will cause
> > > > > > > > > memory fragmentation (we hit this issue in internal SQC).
> > > > > > > > > 
> > > > > > > > > As we know, WED needs to exchange rx packets (belonging to
> > > > > > > > > the WED rx buffer manager) with the wifi driver (DMA rx data
> > > > > > > > > queue) by exchanging WFDMA DMADs (DMA descriptors) to keep
> > > > > > > > > free WED rx buffers available.
> > > > > > > > > 
> > > > > > > > > It is possible that a large number of buffers have been
> > > > > > > > > exchanged to WED and cannot come back to the wlan driver, so
> > > > > > > > > the memory from the same 32K page cache cannot be released
> > > > > > > > > and page_frag_alloc() in mt76_dma_rx_fill() will fail.
> > > > > > > > > 
> > > > > > > > > Any ideas other than memcpy?
> > > > > > > > 
> > > > > > > > A simple solution would be to allocate single pages, or
> > > > > > > > half-page fragments.
> > > > > > > > 
> > > > > > > > - Felix
> > > > > > > > 
> > > > > > > 
> > > > > > A simple approach would be allocating a single page for each rx
> > > > > > buffer.
> > > > > > 
> > > > > > @Sujuan: can you please double-check if it is ok from a
> > > > > > performance and memory fragmentation point of view? If not, I
> > > > > > guess we can try to optimize it and allocate multiple buffers in
> > > > > > the same page, tweaking the page refcount.
> > > > > > 
> > > > > > (this patch must be applied on top of Felix's dma fix).
> > > > > > 
> > > > > 
> > > > > Allocating a single page for each rx buffer avoids memory
> > > > > fragmentation, but it always uses 4K for an rx packet that only
> > > > > needs 2K, right?
> > > > 
> > > > correct, we can optimize it by allocating multiple buffers (in this
> > > > case 2, assuming 4K pages) in a single page and recycling the page.
> > > > 
> > > > > 
> > > > > I guess performance would be worse without the page cache.
> > > > 
> > > > I think it is a trade-off.
> > > > 
> > > > > We tested this on the MTK private driver: a 7% drop in throughput
> > > > > with a 4K page cache compared to the 32K page cache, and a 10% drop
> > > > > when using slab to allocate the buffers.
> > > > 
> > > > IIUC you tested with PAGE_FRAG_CACHE_MAX_SIZE set to 4K (or with a
> > > > private page_frag_alloc() implementation) and you avoided memory
> > > > allocation failures due to fragmentation, but you got a ~7% drop in
> > > > throughput, correct? I think this is quite expected, since we need to
> > > > refill the page cache ~8 times more often.
> > > > 
> > > > Not considering memory fragmentation, have you measured the impact of
> > > > the memcpy of a full buffer?
> > > > 
> > > 
> > > Well, for the pure sw path it is maybe a ~300M drop (not sure) when
> > > using memcpy (5G/HE80/4x4).
> > > But we do the memcpy only when WED is on; after a few unbound pkts, all
> > > flows are offloaded by hw. It is also a trade-off~
> > 
> > I guess we have quite a big impact if we enable wed and we do not
> > actually offload any flow (or we do not enable the hw flowtable in
> > netfilter), don't we?
> 
> Hi Sujuan,
> 
> here ([0],[1]) I switched mt76 to the page_pool allocator [2]. page_pool
> allows us to allocate pages of a requested order (order 0 for mt76) and
> split them between multiple buffers (2 buffers per page for mt7915).
> Moreover, the pp allocator recycles the pages in order to avoid the page
> allocator overhead, and keeps the pages DMA mapped if requested (I
> implemented the support for mt7915).
> The code is stable (with and without WED) but I have not run any
> performance tests. Can you please help with them to see if this code helps
> solve the discussed issue?
> 

Hi Lore,

I tried to apply your patches to my codebase (kernel 5.x). However, since
the page_pool API coming from kernel 6.x does not exist in kernel 5.x, I
cannot build a proper image for testing (even though we applied 30+
page_pool patches).

Sorry for not being able to help.

Regards,
Sujuan

> Regards,
> Lorenzo
> 
> [0] https://github.com/LorenzoBianconi/wireless-drivers-next/commit/61ce8e9be04b40fa8a1c0f2778d266982be32d7a
> [1] https://github.com/LorenzoBianconi/wireless-drivers-next/commit/f54eefce06d389b155574092d246b21631e42d47
> [2] https://docs.kernel.org/networking/page_pool.html
> 
> > 
> > Regards,
> > Lorenzo
> > 
> > > 
> > > Regards,
> > > Sujuan
> > > 
> > > > > 
> > > > > A single page per rx buffer may cause a throughput drop of over 7%
> > > > > and waste memory, what do you think?
> > > > 
> > > > Implementing the page recycling as it is done in page_frag_alloc(),
> > > > we should get the same results you got with PAGE_FRAG_CACHE_MAX_SIZE
> > > > set to 4K.
> > > > 
> > > > Regards,
> > > > Lorenzo
> > > > 
> > > > > 
> > > > > Regards,
> > > > > Sujuan
> > > > > 
> > > > > > Regards,
> > > > > > Lorenzo
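
For readers following the page_pool suggestion above, a rough sketch of
what an order-0 page_pool setup for an rx queue can look like, based on the
API documented in [2] as of this thread's timeframe (<net/page_pool.h>).
The helper names and exact parameter choices here are illustrative
assumptions, not the actual mt76 conversion in [0]/[1]:

#include <net/page_pool.h>

/* Hypothetical helper: one order-0 pool per rx queue; with
 * PP_FLAG_DMA_MAP the pool keeps recycled pages DMA mapped, so the
 * refill path avoids a per-buffer dma_map_single(). */
static struct page_pool *rxq_create_pool(struct device *dev,
					 unsigned int ring_size)
{
	struct page_pool_params pp_params = {
		.order = 0,			/* single 4K pages */
		.flags = PP_FLAG_DMA_MAP,	/* pool handles DMA mapping */
		.pool_size = ring_size,
		.nid = NUMA_NO_NODE,
		.dev = dev,
		.dma_dir = DMA_FROM_DEVICE,
	};

	return page_pool_create(&pp_params);
}

/* Refill path: pages come from the pool's recycle cache when possible,
 * and the DMA address was established when the pool mapped the page. */
static dma_addr_t rxq_get_buf(struct page_pool *pool, struct page **pagep)
{
	struct page *page = page_pool_dev_alloc_pages(pool);

	if (!page)
		return 0;

	*pagep = page;
	return page_pool_get_dma_addr(page);
}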

Patch

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index fc24b35..76ad47d 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -580,24 +580,29 @@  mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 
 		buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
 		if (!buf)
-			break;
+			goto out;
 
 		addr = dma_map_single(dev->dma_dev, buf, len, DMA_FROM_DEVICE);
-		if (unlikely(dma_mapping_error(dev->dma_dev, addr))) {
-			skb_free_frag(buf);
-			break;
-		}
+		if (unlikely(dma_mapping_error(dev->dma_dev, addr)))
+			goto free;
 
 		qbuf.addr = addr + offset;
 		qbuf.len = len - offset;
 		qbuf.skip_unmap = false;
-		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0) {
-			dma_unmap_single(dev->dma_dev, addr, len,
-					 DMA_FROM_DEVICE);
-			skb_free_frag(buf);
-			break;
-		}
+		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0)
+			goto umap;
+
 		frames++;
+		continue;
+
+umap:
+		dma_unmap_single(dev->dma_dev, addr, len,
+				 DMA_FROM_DEVICE);
+free:
+		skb_free_frag(buf);
+out:
+		mt76_put_rxwi(dev, t);
+		break;
 	}
 
 	if (frames)
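
The unwind pattern above is easier to follow outside diff form. Below is a
standalone paraphrase of the control flow in plain userspace C (names
invented, resources simulated): each failure point jumps to the label that
undoes everything acquired before it, and all three error paths now end by
recycling the rxwi, which is the leak this patch fixes.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* Stand-in for the real steps; returns false to simulate a failure. */
static bool step_ok(int step, int fail_at) { return step != fail_at; }

static int fill_one(int fail_at)
{
	void *buf;
	bool mapped = false;
	bool rxwi_held = true;			/* t = mt76_get_rxwi(dev) */

	buf = step_ok(0, fail_at) ? malloc(2048) : NULL; /* page_frag_alloc() */
	if (!buf)
		goto out;

	mapped = step_ok(1, fail_at);		/* dma_map_single() */
	if (!mapped)
		goto free_buf;

	if (!step_ok(2, fail_at))		/* mt76_dma_add_buf() < 0 */
		goto unmap;

	return 0;		/* success: the ring now owns buf and t */

unmap:
	mapped = false;				/* dma_unmap_single() */
free_buf:
	free(buf);				/* skb_free_frag() */
out:
	rxwi_held = false;			/* mt76_put_rxwi(dev, t):
						 * recycled on every error
						 * path - the leak fix */
	(void)mapped; (void)rxwi_held;
	return -1;
}

int main(void)
{
	for (int fail_at = 0; fail_at < 3; fail_at++)
		printf("fail at step %d -> ret %d\n",
		       fail_at, fill_one(fail_at));
	return 0;
}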