| Message ID   | 0a218783098a6b1b90ff120c7182acee93dc8feb.1671423784.git.Bo.Jiao@mediatek.com (mailing list archive) |
|---|---|
| State        | Superseded |
| Delegated to | Felix Fietkau |
| Series       | [v2] wifi: mt76: fix potential memory leakage |

> From: Bo Jiao <Bo.Jiao@mediatek.com>
>
> fix potential memory leakage, recycle rxwi when mt76_dma_add_buf() call fail.
>
> Signed-off-by: Bo Jiao <Bo.Jiao@mediatek.com>
> ---
> v2:
>   - recycle rxwi when page_frag_alloc() and dma_map_single() fail.
> ---
>  drivers/net/wireless/mediatek/mt76/dma.c | 27 ++++++++++++++----------
>  1 file changed, 16 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
> index fc24b35..76ad47d 100644
> --- a/drivers/net/wireless/mediatek/mt76/dma.c
> +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> @@ -580,24 +580,29 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
>
>  		buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
>  		if (!buf)
> -			break;
> +			goto out;
>
>  		addr = dma_map_single(dev->dma_dev, buf, len, DMA_FROM_DEVICE);
> -		if (unlikely(dma_mapping_error(dev->dma_dev, addr))) {
> -			skb_free_frag(buf);
> -			break;
> -		}
> +		if (unlikely(dma_mapping_error(dev->dma_dev, addr)))
> +			goto free;
>
>  		qbuf.addr = addr + offset;
>  		qbuf.len = len - offset;
>  		qbuf.skip_unmap = false;
> -		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0) {
> -			dma_unmap_single(dev->dma_dev, addr, len,
> -					 DMA_FROM_DEVICE);
> -			skb_free_frag(buf);
> -			break;
> -		}
> +		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0)
> +			goto umap;
> +
>  		frames++;
> +		continue;
> +
> +umap:
> +		dma_unmap_single(dev->dma_dev, addr, len,
> +				 DMA_FROM_DEVICE);
> +free:
> +		skb_free_frag(buf);
> +out:
> +		mt76_put_rxwi(dev, t);
> +		break;
>  	}
>
>  	if (frames)

Hi Bo,

I guess in the way below, the code is more readable, what do you think?

Regards,
Lorenzo

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index fad5fe19fe18..001538f698f1 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -571,13 +571,6 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 		struct mt76_queue_buf qbuf;
 		void *buf = NULL;
 
-		if ((q->flags & MT_QFLAG_WED) &&
-		    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
-			t = mt76_get_rxwi(dev);
-			if (!t)
-				break;
-		}
-
 		buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
 		if (!buf)
 			break;
@@ -588,16 +581,27 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 			break;
 		}
 
+		if ((q->flags & MT_QFLAG_WED) &&
+		    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
+			t = mt76_get_rxwi(dev);
+			if (!t)
+				goto unmap;
+		}
+
 		qbuf.addr = addr + offset;
 		qbuf.len = len - offset;
 		qbuf.skip_unmap = false;
-		if (mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t) < 0) {
-			dma_unmap_single(dev->dma_dev, addr, len,
-					 DMA_FROM_DEVICE);
-			skb_free_frag(buf);
-			break;
+
+		if (!mt76_dma_add_buf(dev, q, &qbuf, 1, 0, buf, t)) {
+			frames++;
+			continue;
 		}
-		frames++;
+
+unmap:
+		dma_unmap_single(dev->dma_dev, addr, len, DMA_FROM_DEVICE);
+		skb_free_frag(buf);
+		mt76_put_rxwi(dev, t);
+		break;
 	}
 
 	if (frames)

> --
> 2.18.0
>

On Mon, 2022-12-19 at 10:55 +0100, Lorenzo Bianconi wrote:
[...]
> Hi Bo,
>
> I guess in the way below, the code is more readable, what do you
> think?
>
> Regards,
> Lorenzo
[...]

Hi Lore,

we love your patch, but we have another patch to avoid memory
fragmentation by duplicating the rx skb after mt76_dma_dequeue(). It
requires mt76_get_rxwi() to be placed before page_frag_alloc().

the below patch (needs rebase) will be sent after the current patch is
accepted.

--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -386,9 +386,11 @@ mt76_dma_get_buf(struct mt76_dev *dev, struct mt76_queue *q, int idx,
 				SKB_WITH_OVERHEAD(q->buf_size),
 				DMA_FROM_DEVICE);
 
-	buf = t->ptr;
+	buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
+	if (!buf)
+		return NULL;
+	memcpy(buf, t->ptr, SKB_WITH_OVERHEAD(q->buf_size));
 	t->dma_addr = 0;
-	t->ptr = NULL;
 
 	mt76_put_rxwi(dev, t);
 
@@ -569,6 +571,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 	while (q->queued < q->ndesc - 1) {
 		struct mt76_txwi_cache *t = NULL;
 		struct mt76_queue_buf qbuf;
+		bool skip_alloc = false;
 		void *buf = NULL;
 
 		if ((q->flags & MT_QFLAG_WED) &&
@@ -576,11 +579,19 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 			t = mt76_get_rxwi(dev);
 			if (!t)
 				break;
+
+			if (t->ptr) {
+				skip_alloc = true;
+				buf = t->ptr;
+			}
 		}
 
-		buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
-		if (!buf)
-			break;
+		if (!skip_alloc) {
+			buf = page_frag_alloc(&q->rx_page, q->buf_size,
+					      GFP_ATOMIC);
+			if (!buf)
+				break;
+		}
 
 		addr = dma_map_single(dev->dma_dev, buf, len, DMA_FROM_DEVICE);
 		if (unlikely(dma_mapping_error(dev->dma_dev, addr))) {
--
2.18.0

[...]

> Hi Lore,

Hi Sujuan,

> we love your patch, but we have another patch to avoid memory
> fragmentation by duplicating the rx skb after mt76_dma_dequeue(). It
> requires mt76_get_rxwi() to be placed before page_frag_alloc().
>
> the below patch (needs rebase) will be sent after the current patch is
> accepted.
>
> --- a/drivers/net/wireless/mediatek/mt76/dma.c
> +++ b/drivers/net/wireless/mediatek/mt76/dma.c
> @@ -386,9 +386,11 @@ mt76_dma_get_buf(struct mt76_dev *dev, struct mt76_queue *q, int idx,
>  				SKB_WITH_OVERHEAD(q->buf_size),
>  				DMA_FROM_DEVICE);
>
> -	buf = t->ptr;
> +	buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
> +	if (!buf)
> +		return NULL;
> +	memcpy(buf, t->ptr, SKB_WITH_OVERHEAD(q->buf_size));

With this approach we still need to allocate the buffer (in
mt76_dma_get_buf() instead of mt76_dma_rx_fill()), but we also need to
copy the full buffer, which can be pretty expensive instead of relying on
the DMA, so I would avoid this approach.

Regards,
Lorenzo

[...]

On Wed, 2022-12-21 at 11:26 +0100, lorenzo@kernel.org wrote:
[...]
> With this approach we still need to allocate the buffer (in
> mt76_dma_get_buf() instead of mt76_dma_rx_fill()), but we also need to
> copy the full buffer, which can be pretty expensive instead of relying
> on the DMA, so I would avoid this approach.

Hi Lore,

Yes, it is so expensive, but with no memcpy it will cause memory
fragmentation (we hit this issue in internal SQC).

As we know, WED needs to exchange rx pkts (belonging to the WED rx buffer
manager) with the wifi driver (dma rx data queue) by exchanging the wfdma
dmad to ensure free WED rx buffers.

It is possible that a large number of buffers have been exchanged to WED
and cannot come back to the wlan driver. So the memory from the same 32K
page cache is unable to be released, and page_frag_alloc() in
mt76_dma_rx_fill() will fail.

Any ideas but memcpy?

Regards,
Sujuan

[...]

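[For context, a minimal standalone sketch of the page_frag_alloc() behaviour Sujuan describes; this is hypothetical demo code, not from the thread. All fragments handed out by one page_frag_cache share a single (up to 32K) compound page, and that page can only go back to the system once every outstanding fragment reference is dropped, so one buffer parked in WED pins the whole chunk its neighbours came from.]

#include <linux/gfp.h>
#include <linux/skbuff.h>

static struct page_frag_cache demo_cache;

static void frag_pinning_demo(void)
{
	void *kept, *freed;

	/* both 2K buffers are carved out of the same 32K backing page */
	kept = page_frag_alloc(&demo_cache, 2048, GFP_ATOMIC);
	freed = page_frag_alloc(&demo_cache, 2048, GFP_ATOMIC);
	if (!kept || !freed)
		return;		/* allocation failure, nothing to show */

	/* drops one fragment reference ... */
	skb_free_frag(freed);
	/*
	 * ... but as long as 'kept' is still alive (e.g. owned by WED),
	 * the 32K backing page cannot be released.
	 */
}
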
Hi Sujuan,

> Yes, it is so expensive, but with no memcpy it will cause memory
> fragmentation (we hit this issue in internal SQC).
>
[...]
>
> Any ideas but memcpy?

A simple solution would be to simply allocate single pages, or half-page
fragments.

- Felix

On Dec 21, Felix Fietkau wrote:
> Hi Sujuan,
>
[...]
> > Any ideas but memcpy?
>
> A simple solution would be to simply allocate single pages, or
> half-page fragments.
>
> - Felix

A simple approach would be allocating a single page for each rx buffer.

@Sujuan: can you please double check if it is ok from a performance and
memory fragmentation point of view? If not, I guess we can try to
optimize it and allocate multiple buffers in the same page, tweaking the
page refcount.

(this patch must be applied on top of Felix's dma fix)

Regards,
Lorenzo

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index 28a7fe064313..1d9e580977fc 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -580,6 +580,20 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev, struct mt76_queue *q,
 	return ret;
 }
 
+static void *
+mt76_dma_get_rx_buf(struct mt76_queue *q)
+{
+	if ((q->flags & MT_QFLAG_WED) &&
+	    FIELD_GET(MT_QFLAG_WED_TYPE, q->flags) == MT76_WED_Q_RX) {
+		/* WED RX queue */
+		struct page *page = dev_alloc_page();
+
+		return page ? page_address(page) : NULL;
+	}
+
+	return page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
+}
+
 static int
 mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 {
@@ -596,7 +610,7 @@ mt76_dma_rx_fill(struct mt76_dev *dev, struct mt76_queue *q)
 		struct mt76_queue_buf qbuf;
 		void *buf = NULL;
 
-		buf = page_frag_alloc(&q->rx_page, q->buf_size, GFP_ATOMIC);
+		buf = mt76_dma_get_rx_buf(q);
 		if (!buf)
 			break;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
index 1a2e4df8d1b5..2924e71e4fbe 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
@@ -594,13 +594,9 @@ static void mt7915_mmio_wed_offload_disable(struct mtk_wed_device *wed)
 static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device *wed)
 {
 	struct mt7915_dev *dev;
-	u32 length;
 	int i;
 
 	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
-	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
-				sizeof(struct skb_shared_info));
-
 	for (i = 0; i < dev->mt76.rx_token_size; i++) {
 		struct mt76_txwi_cache *t;
 
@@ -610,7 +606,7 @@ static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device *wed)
 
 		dma_unmap_single(dev->mt76.dma_dev, t->dma_addr,
 				 wed->wlan.rx_size, DMA_FROM_DEVICE);
-		__free_pages(virt_to_page(t->ptr), get_order(length));
+		free_page(virt_to_page(t->ptr));
 		t->ptr = NULL;
 
 		mt76_put_rxwi(&dev->mt76, t);
@@ -621,13 +617,9 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 {
 	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
 	struct mt7915_dev *dev;
-	u32 length;
 	int i;
 
 	dev = container_of(wed, struct mt7915_dev, mt76.mmio.wed);
-	length = SKB_DATA_ALIGN(NET_SKB_PAD + wed->wlan.rx_size +
-				sizeof(struct skb_shared_info));
-
 	for (i = 0; i < size; i++) {
 		struct mt76_txwi_cache *t = mt76_get_rxwi(&dev->mt76);
 		dma_addr_t phy_addr;
@@ -635,7 +627,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 		int token;
 		void *ptr;
 
-		page = __dev_alloc_pages(GFP_KERNEL, get_order(length));
+		page = __dev_alloc_page(GFP_KERNEL);
 		if (!page)
 			goto unmap;
 
@@ -644,7 +636,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 					  wed->wlan.rx_size,
 					  DMA_TO_DEVICE);
 		if (unlikely(dma_mapping_error(dev->mt76.dev, phy_addr))) {
-			__free_pages(page, get_order(length));
+			free_page(page);
 			goto unmap;
 		}
 
@@ -653,7 +645,7 @@ static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 		if (token < 0) {
 			dma_unmap_single(dev->mt76.dma_dev, phy_addr,
 					 wed->wlan.rx_size, DMA_TO_DEVICE);
-			__free_pages(page, get_order(length));
+			free_page(page);
 			goto unmap;
 		}

On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
[...]
> A simple approach would be allocating a single page for each rx buffer.
>
> @Sujuan: can you please double check if it is ok from a performance and
> memory fragmentation point of view? If not, I guess we can try to
> optimize it and allocate multiple buffers in the same page, tweaking
> the page refcount.
>
> (this patch must be applied on top of Felix's dma fix)

Allocating a single page for each rx buffer avoids memory fragmentation,
but it always uses 4K for one rx pkt which only needs 2K, right?

I guess performance would be worse without the page cache.

We have tested on the mtk private driver: a 7% drop in throughput when
setting a 4K page cache compared to the 32K page cache, and a 10% drop
when using slab to allocate the buffer.

A single page per rx buffer may cause a throughput drop of over 7% and
waste memory, what do you think?

Regards,
Sujuan

[...]

> On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
[...]
> Allocating a single page for each rx buffer avoids memory fragmentation,
> but it always uses 4K for one rx pkt which only needs 2K, right?

correct, we can optimize it allocating multiple buffers (in this case 2,
assuming 4K pages) in a single page and recycling the page.

> I guess performance would be worse without the page cache.

I think it is a trade-off

> We have tested on the mtk private driver: a 7% drop in throughput when
> setting a 4K page cache compared to the 32K page cache, and a 10% drop
> when using slab to allocate the buffer.

IIUC you tested with PAGE_FRAG_CACHE_MAX_SIZE set to 4K (or with a
private page_frag_alloc() implementation) and you avoided memory
allocation failures due to fragmentation, but you got a ~7% drop in
throughput, correct? I think this is quite expected since we need to
refill the page cache ~8 times more often.

Not considering memory fragmentation, have you measured the impact of the
memcpy of a full buffer?

> A single page per rx buffer may cause a throughput drop of over 7% and
> waste memory, what do you think?

Implementing page recycling as it is done in page_frag_alloc(), we should
get the same results you got with PAGE_FRAG_CACHE_MAX_SIZE set to 4K.

Regards,
Lorenzo

[...]

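[For reference, a minimal sketch of the "multiple buffers per page, tweaking the page refcount" idea discussed above. The rx_half_page_cache structure and helper are hypothetical, not mt76 code; the sketch assumes each buffer is later released via skb_free_frag(), which drops one page reference, so the page returns to the allocator only after both halves are freed.]

#include <linux/mm.h>
#include <linux/skbuff.h>

struct rx_half_page_cache {		/* hypothetical per-queue state */
	struct page *page;
	unsigned int used;		/* halves already handed out: 0..2 */
};

static void *rx_alloc_half_page_buf(struct rx_half_page_cache *c)
{
	void *buf;

	if (!c->page) {
		c->page = dev_alloc_page();	/* one order-0 page */
		if (!c->page)
			return NULL;
		/* two owners (one per half): take a second reference */
		page_ref_inc(c->page);
		c->used = 0;
	}

	buf = page_address(c->page) + c->used * (PAGE_SIZE / 2);
	if (++c->used == 2)
		c->page = NULL;		/* fully split, start a new page */

	return buf;
}
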
On Thu, 2022-12-22 at 10:44 +0100, lorenzo@kernel.org wrote:
[...]
> IIUC you tested with PAGE_FRAG_CACHE_MAX_SIZE set to 4K (or with a
> private page_frag_alloc() implementation) and you avoided memory
> allocation failures due to fragmentation, but you got a ~7% drop in
> throughput, correct? I think this is quite expected since we need to
> refill the page cache ~8 times more often.
>
> Not considering memory fragmentation, have you measured the impact of
> the memcpy of a full buffer?

well, for the pure sw path it is maybe a ~300M drop (not sure) when using
memcpy (5G/HE80/4*4). but we do memcpy only when WED is on; after a few
unbinding pkts, all flows are offloaded by hw. it is also a trade-off~

Regards,
Sujuan

[...]

> On Thu, 2022-12-22 at 10:44 +0100, lorenzo@kernel.org wrote:
[...]
> well, for the pure sw path it is maybe a ~300M drop (not sure) when
> using memcpy (5G/HE80/4*4). but we do memcpy only when WED is on; after
> a few unbinding pkts, all flows are offloaded by hw. it is also a
> trade-off~

I guess we have quite a big impact if we enable WED and we do not
actually offload any flow (or we do not enable the hw flowtable in
netfilter), don't we?

Regards,
Lorenzo

[...]

[...]
> I guess we have quite a big impact if we enable WED and we do not
> actually offload any flow (or we do not enable the hw flowtable in
> netfilter), don't we?

Hi Sujuan,

here ([0],[1]) I switched mt76 to the page_pool allocator [2]. page_pool
allows us to allocate pages of a requested order (order 0 for mt76) and
split them between multiple buffers (2 buffers per page for mt7915).
Moreover, the pp allocator recycles the pages in order to avoid the page
allocator overhead, and keeps the pages DMA-mapped if requested (I
implemented the support for mt7915).
The code is stable (with and without WED) but I have not run any
performance tests. Can you please help with them to see if this code
helps solve the discussed issue?

Regards,
Lorenzo

[0] https://github.com/LorenzoBianconi/wireless-drivers-next/commit/61ce8e9be04b40fa8a1c0f2778d266982be32d7a
[1] https://github.com/LorenzoBianconi/wireless-drivers-next/commit/f54eefce06d389b155574092d246b21631e42d47
[2] https://docs.kernel.org/networking/page_pool.html

[...]

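[For reference, a minimal sketch of the page_pool pattern the linked commits are described as building on: order-0 pages, the fragment API splitting each 4K page between two rx buffers, and pages kept DMA-mapped and recycled by the pool. The helper names are assumptions, not the actual mt76 patches, and the fragment API shown here requires a recent (6.x-era) kernel.]

#include <linux/dma-mapping.h>
#include <net/page_pool.h>

static struct page_pool *rx_create_page_pool(struct device *dev,
					     unsigned int ndesc)
{
	struct page_pool_params pp = {
		.order = 0,			/* single 4K pages */
		.flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG,
		.pool_size = ndesc,
		.nid = NUMA_NO_NODE,
		.dev = dev,			/* pool maps/unmaps the pages */
		.dma_dir = DMA_FROM_DEVICE,
	};

	return page_pool_create(&pp);
}

static void *rx_alloc_frag(struct page_pool *pool, unsigned int buf_size,
			   dma_addr_t *dma_addr)
{
	unsigned int offset;
	struct page *page;

	/* carve buf_size bytes out of a (possibly recycled) mapped page */
	page = page_pool_dev_alloc_frag(pool, &offset, buf_size);
	if (!page)
		return NULL;

	*dma_addr = page_pool_get_dma_addr(page) + offset;
	return page_address(page) + offset;
}

/* buffers go back to the pool via page_pool_put_full_page() */
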
On Fri, 2022-12-23 at 00:57 +0100, lorenzo@kernel.org wrote:
> > > On Thu, 2022-12-22 at 10:44 +0100, lorenzo@kernel.org wrote:
> > > > > On Wed, 2022-12-21 at 19:02 +0100, Lorenzo Bianconi wrote:
> > > > > > [...]
> > > > >
> > > > > Allocating a single page for each rx buffer avoids memory
> > > > > fragmentation, but it always uses 4K for one rx pkt which only
> > > > > needs 2K, right?
> > > >
> > > > correct, we can optimize it allocating multiple buffers (in this
> > > > case 2, assuming 4K pages) in a single page and recycling the page.
> > > >
> > > > > I guess performance would be worse without the page cache.
> > > >
> > > > I think it is a trade-off
> > > >
> > > > > We have tested on the mtk private driver: a 7% drop in throughput
> > > > > when setting a 4K page cache compared to the 32K page cache, and
> > > > > a 10% drop when using slab to allocate the buffers.
> > > >
> > > > IIUC you tested with PAGE_FRAG_CACHE_MAX_SIZE set to 4K (or with a
> > > > private page_frag_alloc() implementation) and you avoided memory
> > > > allocation failures due to fragmentation, but you got a ~7% drop in
> > > > throughput, correct?
> > > > I think this is quite expected since we need to refill the page
> > > > cache ~8 times more often.
> > > >
> > > > Not considering memory fragmentation, have you measured the impact
> > > > of the memcpy of a full buffer?
> > >
> > > well, for the pure sw path it may be a ~300Mbps drop (not sure) when
> > > using memcpy (5G/HE80/4x4), but we do the memcpy only when wed is on;
> > > after a few unbinding pkts, all flows are offloaded by hw.
> > > it is also a trade-off~
> >
> > I guess we have quite a big impact if we enable wed and we do not
> > actually offload any flow (or we do not enable the hw flowtable in
> > netfilter), don't we?
>
> Hi Sujuan,
>
> here ([0],[1]) I switched mt76 to the page_pool allocator [2]. page_pool
> allows us to allocate pages of a requested order (order 0 for mt76) and
> split them between multiple buffers (2 buffers per page for mt7915).
> Moreover, the pp allocator recycles the pages in order to avoid the page
> allocator overhead, and keeps the pages DMA mapped if requested (I
> implemented the support for mt7915).
> The code is stable (with and without WED) but I have not run any
> performance tests. Can you please help with them, to see if this code
> helps solve the discussed issue?

Hi Lore,

I tried to apply your patches to my codebase (kernel-5.x). However, since
the page pool API they rely on comes from kernel-6.x and does not exist in
kernel-5.x, I could not build a proper image for testing (even though we
applied 30+ page pool patches). Sorry for not helping you.

Regards,
Sujuan

> Regards,
> Lorenzo
>
> [0] https://github.com/LorenzoBianconi/wireless-drivers-next/commit/61ce8e9be04b40fa8a1c0f2778d266982be32d7a
> [1] https://github.com/LorenzoBianconi/wireless-drivers-next/commit/f54eefce06d389b155574092d246b21631e42d47
> [2] https://docs.kernel.org/networking/page_pool.html
>
> > Regards,
> > Lorenzo
>
> > > Regards,
> > > Sujuan
> > >
> > > > > A single page per rx buffer may cause a throughput drop of over
> > > > > 7% and waste memory, what do you think?
> > > >
> > > > Implementing the page recycling as it is done in page_frag_alloc()
> > > > we should get the same results you got with PAGE_FRAG_CACHE_MAX_SIZE
> > > > set to 4K.
> > > >
> > > > Regards,
> > > > Lorenzo
> > > >
> > > > > Regards,
> > > > > Sujuan
> > > > >
> > > > > > Regards,
> > > > > > Lorenzo
> > > > > >
> > > > > > [...]
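For reference, a minimal sketch of what such a page_pool conversion
amounts to is shown below. It follows the generic API documented in [2],
so the q->page_pool field and the one-pool-per-rx-queue layout are
assumptions for illustration, not necessarily the exact code in [0]/[1].
PP_FLAG_DMA_MAP asks the pool to keep recycled pages DMA mapped, so the
rx-fill path no longer has to call dma_map_single() for every buffer.

#include <net/page_pool.h>

static int
mt76_rx_page_pool_create(struct mt76_dev *dev, struct mt76_queue *q)
{
	struct page_pool_params pp_params = {
		.order		= 0,	/* single 4K pages, as discussed above */
		.flags		= PP_FLAG_DMA_MAP,
		.pool_size	= q->ndesc,
		.nid		= NUMA_NO_NODE,
		.dev		= dev->dma_dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	q->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(q->page_pool)) {
		int err = PTR_ERR(q->page_pool);

		q->page_pool = NULL;
		return err;
	}

	return 0;
}

Rx buffers would then come from page_pool_dev_alloc_pages() (or, with
PP_FLAG_PAGE_FRAG and page_pool_dev_alloc_frag(), as sub-page fragments,
the "2 buffers per page" behaviour Lorenzo describes) and be returned
with page_pool_put_full_page() so they can be recycled instead of going
back to the page allocator. This API being relatively recent is also why
the kernel-5.x backport Sujuan attempted needed so many prerequisite
patches.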