[net-next,RFC] net: ethernet: mtk_eth_soc: use prefetch methods

Message ID 20240720164621.1983-1-eladwf@gmail.com (mailing list archive)
State New, archived

Commit Message

Elad Yifee July 20, 2024, 4:46 p.m. UTC
Utilize kernel prefetch methods for faster cache line access.
This change boosts driver performance,
allowing the CPU to handle about 5% more packets/sec.

Signed-off-by: Elad Yifee <eladwf@gmail.com>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

Comments

Joe Damato July 22, 2024, 4:17 p.m. UTC | #1
On Sat, Jul 20, 2024 at 07:46:18PM +0300, Elad Yifee wrote:
> Utilize kernel prefetch methods for faster cache line access.
> This change boosts driver performance,
> allowing the CPU to handle about 5% more packets/sec.

Nit: It'd be great to see before/after numbers and/or an explanation of
how you measured this in the commit message.

> Signed-off-by: Elad Yifee <eladwf@gmail.com>
> ---
>  drivers/net/ethernet/mediatek/mtk_eth_soc.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
> index 0cc2dd85652f..1a0704166103 100644
> --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
> +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
> @@ -1963,6 +1963,7 @@ static u32 mtk_xdp_run(struct mtk_eth *eth, struct mtk_rx_ring *ring,
>  	if (!prog)
>  		goto out;
>  
> +	prefetchw(xdp->data_hard_start);

Is there any reason to mix net_prefetch (as you have below) with
prefetch and prefetchw ?

IMHO: you should consider using net_prefetch and net_prefetchw
everywhere instead of using both in your code.
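
For reference, both helpers live in include/linux/netdevice.h; roughly,
each prefetches the first cache line plus a second one on systems where
L1_CACHE_BYTES < 128, since the stack usually touches at least that much
packet data:

/* Slightly paraphrased from include/linux/netdevice.h */
static inline void net_prefetch(void *p)
{
	prefetch(p);
#if L1_CACHE_BYTES < 128
	prefetch((u8 *)p + L1_CACHE_BYTES);
#endif
}

static inline void net_prefetchw(void *p)
{
	prefetchw(p);
#if L1_CACHE_BYTES < 128
	prefetchw((u8 *)p + L1_CACHE_BYTES);
#endif
}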

>  	act = bpf_prog_run_xdp(prog, xdp);
>  	switch (act) {
>  	case XDP_PASS:
> @@ -2039,7 +2040,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
>  		idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size);
>  		rxd = ring->dma + idx * eth->soc->rx.desc_size;
>  		data = ring->data[idx];
> -
> +		prefetch(rxd);

Maybe net_prefetch instead, as mentioned above?

>  		if (!mtk_rx_get_desc(eth, &trxd, rxd))
>  			break;
>  
> @@ -2105,6 +2106,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
>  			if (ret != XDP_PASS)
>  				goto skip_rx;
>  
> +			net_prefetch(xdp.data_meta);
>  			skb = build_skb(data, PAGE_SIZE);
>  			if (unlikely(!skb)) {
>  				page_pool_put_full_page(ring->page_pool,
> @@ -2113,6 +2115,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
>  				goto skip_rx;
>  			}
>  
> +			prefetchw(skb->data);

Maybe net_prefetchw instead, as mentioned above?

>  			skb_reserve(skb, xdp.data - xdp.data_hard_start);
>  			skb_put(skb, xdp.data_end - xdp.data);
>  			skb_mark_for_recycle(skb);
> @@ -2143,6 +2146,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
>  			dma_unmap_single(eth->dma_dev, ((u64)trxd.rxd1 | addr64),
>  					 ring->buf_size, DMA_FROM_DEVICE);
>  
> +			net_prefetch(data);
>  			skb = build_skb(data, ring->frag_size);
>  			if (unlikely(!skb)) {
>  				netdev->stats.rx_dropped++;
> @@ -2150,6 +2154,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
>  				goto skip_rx;
>  			}
>  
> +			prefetchw(skb->data);

Maybe net_prefetchw instead, as mentioned above?
Elad Yifee July 22, 2024, 6:04 p.m. UTC | #2
On Mon, Jul 22, 2024 at 7:17 PM Joe Damato <jdamato@fastly.com> wrote:
>
> On Sat, Jul 20, 2024 at 07:46:18PM +0300, Elad Yifee wrote:
> > Utilize kernel prefetch methods for faster cache line access.
> > This change boosts driver performance,
> > allowing the CPU to handle about 5% more packets/sec.
>
> Nit: It'd be great to see before/after numbers and/or an explanation of
> how you measured this in the commit message.
Sure, I'll add iperf3 results in the next version.
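As a rough sketch of that kind of measurement (the addresses, durations
and packet sizes here are placeholders, not the actual setup):

  # on the board running mtk_eth_soc
  iperf3 -s

  # on the peer: TCP throughput
  iperf3 -c <board-addr> -t 30 -P 4

  # small-packet UDP run to stress the per-packet RX path
  iperf3 -c <board-addr> -u -b 0 -l 64 -t 30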

> Is there any reason to mix net_prefetch (as you have below) with
> prefetch and prefetchw ?
>
> IMHO: you should consider using net_prefetch and net_prefetchw
> everywhere instead of using both in your code.
You are right; honestly, I didn't notice it existed.
I'll replace all prefetchw calls with net_prefetchw.

> > @@ -2039,7 +2040,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
> >               idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size);
> >               rxd = ring->dma + idx * eth->soc->rx.desc_size;
> >               data = ring->data[idx];
> > -
> > +             prefetch(rxd);
>
> Maybe net_prefetch instead, as mentioned above?
This is the only case where I think plain prefetch should be used, since
it's only the descriptor.

Thank you for your suggestions
Joe Damato July 22, 2024, 6:26 p.m. UTC | #3
On Mon, Jul 22, 2024 at 09:04:06PM +0300, Elad Yifee wrote:
> On Mon, Jul 22, 2024 at 7:17 PM Joe Damato <jdamato@fastly.com> wrote:
> >
> > On Sat, Jul 20, 2024 at 07:46:18PM +0300, Elad Yifee wrote:
> > > Utilize kernel prefetch methods for faster cache line access.
> > > This change boosts driver performance,
> > > allowing the CPU to handle about 5% more packets/sec.
> >
> > Nit: It'd be great to see before/after numbers and/or an explanation of
> > how you measured this in the commit message.
> Sure, I'll add iperf3 results in the next version.

Thanks, that'd be helpful!

[...]

> > > @@ -2039,7 +2040,7 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget,
> > >               idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size);
> > >               rxd = ring->dma + idx * eth->soc->rx.desc_size;
> > >               data = ring->data[idx];
> > > -
> > > +             prefetch(rxd);
> >
> > Maybe net_prefetch instead, as mentioned above?
> This is the only case where I think prefetch should be used since it's
> only the descriptor.

I think you are implying that the optimization in the case of
L1_CACHE_BYTES < 128 is unnecessary, because the mtk_rx_dma_v2
descriptors will be too far (i * eth->soc->rx.desc_size) apart to get
any benefit from prefetching more data?

If my understanding is correct, then yes: I agree.
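
A rough sketch of that reasoning (the 64-byte line size and the
"descriptor fits in one line" assumption are illustrative; I haven't
checked them against the driver headers):

#include <linux/cache.h>	/* L1_CACHE_BYTES */
#include <linux/prefetch.h>	/* prefetch() */

/*
 * Illustration only: assume a 64-byte cache line and an RX descriptor of
 * a few 32-bit words, so one descriptor always fits in a single line.
 */
static inline void rx_desc_prefetch_sketch(const void *rxd)
{
	/* One line covers the whole descriptor we are about to read. */
	prefetch(rxd);

	/*
	 * net_prefetch(rxd) would also pull in the following line when
	 * L1_CACHE_BYTES < 128, but that line only holds a later
	 * descriptor, which is not touched until a later iteration of
	 * the mtk_poll_rx() loop, so the extra prefetch buys nothing
	 * here.
	 */
}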

> Thank you for your suggestions

No problem!

Patch

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 0cc2dd85652f..1a0704166103 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1963,6 +1963,7 @@  static u32 mtk_xdp_run(struct mtk_eth *eth, struct mtk_rx_ring *ring,
 	if (!prog)
 		goto out;
 
+	prefetchw(xdp->data_hard_start);
 	act = bpf_prog_run_xdp(prog, xdp);
 	switch (act) {
 	case XDP_PASS:
@@ -2039,7 +2040,7 @@  static int mtk_poll_rx(struct napi_struct *napi, int budget,
 		idx = NEXT_DESP_IDX(ring->calc_idx, ring->dma_size);
 		rxd = ring->dma + idx * eth->soc->rx.desc_size;
 		data = ring->data[idx];
-
+		prefetch(rxd);
 		if (!mtk_rx_get_desc(eth, &trxd, rxd))
 			break;
 
@@ -2105,6 +2106,7 @@  static int mtk_poll_rx(struct napi_struct *napi, int budget,
 			if (ret != XDP_PASS)
 				goto skip_rx;
 
+			net_prefetch(xdp.data_meta);
 			skb = build_skb(data, PAGE_SIZE);
 			if (unlikely(!skb)) {
 				page_pool_put_full_page(ring->page_pool,
@@ -2113,6 +2115,7 @@  static int mtk_poll_rx(struct napi_struct *napi, int budget,
 				goto skip_rx;
 			}
 
+			prefetchw(skb->data);
 			skb_reserve(skb, xdp.data - xdp.data_hard_start);
 			skb_put(skb, xdp.data_end - xdp.data);
 			skb_mark_for_recycle(skb);
@@ -2143,6 +2146,7 @@  static int mtk_poll_rx(struct napi_struct *napi, int budget,
 			dma_unmap_single(eth->dma_dev, ((u64)trxd.rxd1 | addr64),
 					 ring->buf_size, DMA_FROM_DEVICE);
 
+			net_prefetch(data);
 			skb = build_skb(data, ring->frag_size);
 			if (unlikely(!skb)) {
 				netdev->stats.rx_dropped++;
@@ -2150,6 +2154,7 @@  static int mtk_poll_rx(struct napi_struct *napi, int budget,
 				goto skip_rx;
 			}
 
+			prefetchw(skb->data);
 			skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
 			skb_put(skb, pktlen);
 		}