[1/1] net: fec: add initial XDP support

Message ID 20220928152509.141490-1-shenwei.wang@nxp.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 1 maintainers not CCed: bpf@vger.kernel.org
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Shenwei Wang Sept. 28, 2022, 3:25 p.m. UTC
This patch adds initial XDP support to the Freescale FEC driver. It supports
XDP_PASS, XDP_DROP and XDP_REDIRECT actions. Upcoming patches will add
support for XDP_TX and Zero Copy features.

This patch also optimizes the RX buffers by using the page pool, which
uses one frame per page for easy management. In the future, it can be
further improved to use two frames per page.

This patch has been tested with the "xdpsock" and "xdp2" sample apps in the
samples/bpf directory, in both SKB and native (XDP) mode. The following
are the test results compared with XDP skb-mode.

 # xdpsock -i eth0
 sock0@eth0:0 rxdrop xdp-drv
                   pps            pkts           1.00
 rx                 198798         1040011
 tx                 0              0

 # xdpsock -S -i eth0         // skb-mode
 sock0@eth0:0 rxdrop xdp-skb
                    pps            pkts           1.00
 rx                 95638          717251
 tx                 0              0

 # xdp2 eth0
 proto 0:     475362 pkt/s
 proto 0:     475549 pkt/s
 proto 0:     475480 pkt/s
 proto 0:     143258 pkt/s

 # xdp2 -S eth0    // skb-mode
 proto 17:      56468 pkt/s
 proto 17:      71999 pkt/s
 proto 17:      72000 pkt/s
 proto 17:      71988 pkt/s

Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
---
 drivers/net/ethernet/freescale/fec.h      |  34 +-
 drivers/net/ethernet/freescale/fec_main.c | 414 +++++++++++++++++++---
 2 files changed, 393 insertions(+), 55 deletions(-)

Comments

kernel test robot Sept. 28, 2022, 9:18 p.m. UTC | #1
Hi Shenwei,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]
[also build test WARNING on next-20220927]
[cannot apply to net/master linus/master v6.0-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Shenwei-Wang/net-fec-add-initial-XDP-support/20220928-232917
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git b9a5cbf8ba24e88071a97a51a09ef5cdf0d1f6a1
config: arm-imx_v4_v5_defconfig
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 791a7ae1ba3efd6bca96338e10ffde557ba83920)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm cross compiling tool for clang build
        # apt-get install binutils-arm-linux-gnueabi
        # https://github.com/intel-lab-lkp/linux/commit/0ef976d16b5d6993db5c83926eaa836fc91a6450
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Shenwei-Wang/net-fec-add-initial-XDP-support/20220928-232917
        git checkout 0ef976d16b5d6993db5c83926eaa836fc91a6450
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash drivers/net/ethernet/freescale/ net/core/ net/ethtool/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/net/ethernet/freescale/fec_main.c:3775:9: warning: variable 'nxmit' set but not used [-Wunused-but-set-variable]
           int i, nxmit = 0;
                  ^
   1 warning generated.


vim +/nxmit +3775 drivers/net/ethernet/freescale/fec_main.c

  3764	
  3765	static int fec_enet_xdp_xmit(struct net_device *dev,
  3766				     int num_frames,
  3767				     struct xdp_frame **frames,
  3768				     u32 flags)
  3769	{
  3770		struct fec_enet_private *fep = netdev_priv(dev);
  3771		struct fec_enet_priv_tx_q *txq;
  3772		int cpu = smp_processor_id();
  3773		struct netdev_queue *nq;
  3774		unsigned int queue;
> 3775		int i, nxmit = 0;
  3776	
  3777		queue = fec_enet_xdp_get_tx_queue(fep, cpu);
  3778		txq = fep->tx_queue[queue];
  3779		nq = netdev_get_tx_queue(fep->netdev, queue);
  3780	
  3781		__netif_tx_lock(nq, cpu);
  3782	
  3783		for (i = 0; i < num_frames; i++) {
  3784			fec_enet_txq_xmit_frame(fep, txq, frames[i]);
  3785			nxmit++;
  3786		}
  3787	
  3788		/* Make sure the update to bdp and tx_skbuff are performed. */
  3789		wmb();
  3790	
  3791		/* Trigger transmission start */
  3792		writel(0, txq->bd.reg_desc_active);
  3793	
  3794		__netif_tx_unlock(nq);
  3795	
  3796		return num_frames;
  3797	}
  3798
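
A minimal sketch of one possible fix, assuming (hypothetically) that
fec_enet_txq_xmit_frame() returns a negative errno on failure: stop at the
first frame that fails to queue and return the count actually sent, which
is also what the ndo_xdp_xmit contract expects:

        for (i = 0; i < num_frames; i++) {
                if (fec_enet_txq_xmit_frame(fep, txq, frames[i]) < 0)
                        break;
                nxmit++;
        }

        /* ... wmb() and doorbell write as above ... */

        return nxmit;
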
Andrew Lunn Sept. 29, 2022, 1:33 a.m. UTC | #2
On Wed, Sep 28, 2022 at 10:25:09AM -0500, Shenwei Wang wrote:
> This patch adds initial XDP support to the Freescale FEC driver. It supports
> XDP_PASS, XDP_DROP and XDP_REDIRECT actions. Upcoming patches will add
> support for XDP_TX and Zero Copy features.
> 
> This patch also optimizes the RX buffers by using the page pool, which
> uses one frame per page for easy management. In the future, it can be
> further improved to use two frames per page.

Please could you split this patch up. It is rather large and hard to
review. I think you can first add support for the page pool, and then
add XDP support, for example. 

It would be interesting to see how the page pool helps performance for
normal traffic, since that is what most people use it for. And for a
range of devices, since we need to make sure it does not cause any
regressions for older devices.

	    Andrew
Andrew Lunn Sept. 29, 2022, 1:50 a.m. UTC | #3
> +struct fec_enet_xdp_stats {
> +	u64	xdp_pass;
> +	u64	xdp_drop;
> +	u64	xdp_xmit;
> +	u64	xdp_redirect;
> +	u64	xdp_xmit_err;
> +	u64	xdp_tx;
> +	u64	xdp_tx_err;
> +};
> +
> +	switch (act) {
> +	case XDP_PASS:
> +		rxq->stats.xdp_pass++;

Since the stats are u64, and most machines using the FEC are 32 bit,
you cannot just do an increment. Take a look at u64_stats_sync.h.
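
A minimal sketch of what that could look like (illustrative only: the
syncp field is new, the counter names follow the patch):

        #include <linux/u64_stats_sync.h>

        struct fec_enet_xdp_stats {
                struct u64_stats_sync syncp; /* guards 64-bit counters on 32-bit CPUs */
                u64     xdp_pass;
                u64     xdp_drop;
                u64     xdp_redirect;
        };

        /* writer side, NAPI poll context */
        u64_stats_update_begin(&rxq->stats.syncp);
        rxq->stats.xdp_pass++;
        u64_stats_update_end(&rxq->stats.syncp);

        /* reader side, e.g. ethtool -S */
        unsigned int start;
        u64 xdp_pass;

        do {
                start = u64_stats_fetch_begin(&rxq->stats.syncp);
                xdp_pass = rxq->stats.xdp_pass;
        } while (u64_stats_fetch_retry(&rxq->stats.syncp, start));

(u64_stats_init(&rxq->stats.syncp) would be needed once at queue setup.)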

> -#define FEC_STATS_SIZE		(ARRAY_SIZE(fec_stats) * sizeof(u64))
> +static struct fec_xdp_stat {
> +	char name[ETH_GSTRING_LEN];
> +	u32 count;
> +} fec_xdp_stats[] = {
> +	{ "rx_xdp_redirect", 0 },
> +	{ "rx_xdp_pass", 0 },
> +	{ "rx_xdp_drop", 0 },
> +	{ "rx_xdp_tx", 0 },
> +	{ "rx_xdp_tx_errors", 0 },
> +	{ "tx_xdp_xmit", 0 },
> +	{ "tx_xdp_xmit_errors", 0 },
> +};
> +
> +#define FEC_STATS_SIZE	((ARRAY_SIZE(fec_stats) + \
> +			ARRAY_SIZE(fec_xdp_stats)) * sizeof(u64))

The page pool also has some stats. See page_pool_get_stats(),
page_pool_ethtool_stats_get_strings() etc.
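
A rough sketch of how those could be aggregated across the RX queues
(assumes CONFIG_PAGE_POOL_STATS; fec_get_pp_stats() is a hypothetical
helper, the page_pool_* calls are the in-tree API):

        #include <net/page_pool.h>

        /* accumulate page_pool stats from every RX queue and render them
         * in the layout declared by page_pool_ethtool_stats_get_strings()
         */
        static void fec_get_pp_stats(struct fec_enet_private *fep, u64 *data)
        {
                struct page_pool_stats stats = {};
                int i;

                for (i = 0; i < fep->num_rx_queues; i++)
                        page_pool_get_stats(fep->rx_queue[i]->page_pool, &stats);

                page_pool_ethtool_stats_get(data, &stats);
        }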

      Andrew
kernel test robot Sept. 29, 2022, 2:43 a.m. UTC | #4
Hi Shenwei,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]
[also build test WARNING on next-20220928]
[cannot apply to net/master linus/master v6.0-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Shenwei-Wang/net-fec-add-initial-XDP-support/20220928-232917
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git b9a5cbf8ba24e88071a97a51a09ef5cdf0d1f6a1
config: parisc-randconfig-m031-20220925
compiler: hppa64-linux-gcc (GCC) 12.1.0

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

New smatch warnings:
drivers/net/ethernet/freescale/fec_main.c:3285 fec_enet_alloc_rxq_buffers() warn: unsigned 'err' is never less than zero.

Old smatch warnings:
drivers/net/ethernet/freescale/fec_main.c:3658 fec_enet_select_queue() warn: potential spectre issue 'fec_enet_vlan_pri_to_queue' [r]

vim +/err +3285 drivers/net/ethernet/freescale/fec_main.c

  3269	
  3270	static int
  3271	fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
  3272	{
  3273		struct fec_enet_private *fep = netdev_priv(ndev);
  3274		unsigned int i, err;
  3275		struct bufdesc	*bdp;
  3276		struct fec_enet_priv_rx_q *rxq;
  3277	
  3278		dma_addr_t phys_addr;
  3279		struct page *page;
  3280	
  3281		rxq = fep->rx_queue[queue];
  3282		bdp = rxq->bd.base;
  3283	
  3284		err = fec_enet_create_page_pool(fep, rxq, rxq->bd.ring_size);
> 3285		if (err < 0) {
  3286			netdev_err(ndev, "%s failed queue %d (%d)\n", __func__, queue, err);
  3287			return err;
  3288		}
  3289	
  3290		for (i = 0; i < rxq->bd.ring_size; i++) {
  3291			page = page_pool_dev_alloc_pages(rxq->page_pool);
  3292			if (!page)
  3293				goto err_alloc;
  3294	
  3295			phys_addr = page_pool_get_dma_addr(page) + FEC_ENET_XDP_HEADROOM;
  3296			bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
  3297	
  3298			rxq->rx_skb_info[i].page = page;
  3299			rxq->rx_skb_info[i].offset = FEC_ENET_XDP_HEADROOM;
  3300			bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY);
  3301	
  3302			if (fep->bufdesc_ex) {
  3303				struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
  3304				ebdp->cbd_esc = cpu_to_fec32(BD_ENET_RX_INT);
  3305			}
  3306	
  3307			bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
  3308		}
  3309	
  3310		/* Set the last buffer to wrap. */
  3311		bdp = fec_enet_get_prevdesc(bdp, &rxq->bd);
  3312		bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
  3313		return 0;
  3314	
  3315	 err_alloc:
  3316		fec_enet_free_buffers(ndev);
  3317		return -ENOMEM;
  3318	}
  3319
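
The warning points at a real type bug: err is declared unsigned int, so
the "err < 0" test can never be true and a failed page pool creation
would be silently ignored. A sketch of the fix:

	-	unsigned int i, err;
	+	unsigned int i;
	+	int err;
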
Jesper Dangaard Brouer Sept. 29, 2022, 10:16 a.m. UTC | #5
On 28/09/2022 17.25, Shenwei Wang wrote:
> This patch adds initial XDP support to the Freescale FEC driver. It supports
> XDP_PASS, XDP_DROP and XDP_REDIRECT actions. Upcoming patches will add
> support for XDP_TX and Zero Copy features.
> 
> This patch also optimizes the RX buffers by using the page pool, which
> uses one frame per page for easy management. In the future, it can be
> further improved to use two frames per page.
> 
> This patch has been tested with the "xdpsock" and "xdp2" sample apps in the
> samples/bpf directory, in both SKB and native (XDP) mode. The following
> are the test results compared with XDP skb-mode.
> 
>   # xdpsock -i eth0
>   sock0@eth0:0 rxdrop xdp-drv
>                     pps            pkts           1.00
>   rx                 198798         1040011
>   tx                 0              0
> 
>   # xdpsock -S -i eth0         // skb-mode
>   sock0@eth0:0 rxdrop xdp-skb
>                      pps            pkts           1.00
>   rx                 95638          717251
>   tx                 0              0
> 
>   # xdp2 eth0
>   proto 0:     475362 pkt/s
>   proto 0:     475549 pkt/s
>   proto 0:     475480 pkt/s
>   proto 0:     143258 pkt/s
> 
>   # xdp2 -S eth0    // skb-mode
>   proto 17:      56468 pkt/s
>   proto 17:      71999 pkt/s
>   proto 17:      72000 pkt/s
>   proto 17:      71988 pkt/s
> 
> Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
> ---
>   drivers/net/ethernet/freescale/fec.h      |  34 +-
>   drivers/net/ethernet/freescale/fec_main.c | 414 +++++++++++++++++++---
>   2 files changed, 393 insertions(+), 55 deletions(-)
> 
> diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
> index b0100fe3c9e4..f7531503aa95 100644
> --- a/drivers/net/ethernet/freescale/fec.h
> +++ b/drivers/net/ethernet/freescale/fec.h
> @@ -346,8 +346,10 @@ struct bufdesc_ex {
>    * the skbuffer directly.
>    */
>   
> +#define FEC_ENET_XDP_HEADROOM	(512) /* XDP_PACKET_HEADROOM + NET_IP_ALIGN) */

Why the large headroom?

> +
>   #define FEC_ENET_RX_PAGES	256
> -#define FEC_ENET_RX_FRSIZE	2048
> +#define FEC_ENET_RX_FRSIZE	(PAGE_SIZE - FEC_ENET_XDP_HEADROOM)

This FEC_ENET_RX_FRSIZE is likely wrong, because you also need to
reserve 320 bytes at the end for struct skb_shared_info.
(320 calculated as SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
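
Concretely, a sketch of the corrected define:

        #define FEC_ENET_RX_FRSIZE	(PAGE_SIZE - FEC_ENET_XDP_HEADROOM - \
                                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))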

>   #define FEC_ENET_RX_FRPPG	(PAGE_SIZE / FEC_ENET_RX_FRSIZE)
>   #define RX_RING_SIZE		(FEC_ENET_RX_FRPPG * FEC_ENET_RX_PAGES)
>   #define FEC_ENET_TX_FRSIZE	2048
> @@ -517,6 +519,22 @@ struct bufdesc_prop {
[...]

> diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
> index 59921218a8a4..2e30182ed770 100644
> --- a/drivers/net/ethernet/freescale/fec_main.c
> +++ b/drivers/net/ethernet/freescale/fec_main.c
> @@ -66,6 +66,8 @@
>   #include <linux/mfd/syscon.h>
>   #include <linux/regmap.h>
>   #include <soc/imx/cpuidle.h>
> +#include <linux/filter.h>
> +#include <linux/bpf.h>
>   
>   #include <asm/cacheflush.h>
>   
> @@ -87,6 +89,11 @@ static const u16 fec_enet_vlan_pri_to_queue[8] = {0, 0, 1, 1, 1, 2, 2, 2};
>   #define FEC_ENET_OPD_V	0xFFF0
>   #define FEC_MDIO_PM_TIMEOUT  100 /* ms */
>   
> +#define FEC_ENET_XDP_PASS          0
> +#define FEC_ENET_XDP_CONSUMED      BIT(0)
> +#define FEC_ENET_XDP_TX            BIT(1)
> +#define FEC_ENET_XDP_REDIR         BIT(2)
> +
>   struct fec_devinfo {
>   	u32 quirks;
>   };
> @@ -422,6 +429,49 @@ fec_enet_clear_csum(struct sk_buff *skb, struct net_device *ndev)
>   	return 0;
>   }
>   
> +static int
> +fec_enet_create_page_pool(struct fec_enet_private *fep,
> +			  struct fec_enet_priv_rx_q *rxq, int size)
> +{
> +	struct bpf_prog *xdp_prog = READ_ONCE(fep->xdp_prog);
> +	struct page_pool_params pp_params = {
> +		.order = 0,
> +		.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
> +		.pool_size = size,
> +		.nid = dev_to_node(&fep->pdev->dev),
> +		.dev = &fep->pdev->dev,
> +		.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE,
> +		.offset = FEC_ENET_XDP_HEADROOM,
> +		.max_len = FEC_ENET_RX_FRSIZE,

XDP BPF-prog cannot access last 320 bytes, so FEC_ENET_RX_FRSIZE is 
wrong here.

> +	};
> +	int err;
> +
> +	rxq->page_pool = page_pool_create(&pp_params);
> +	if (IS_ERR(rxq->page_pool)) {
> +		err = PTR_ERR(rxq->page_pool);
> +		rxq->page_pool = NULL;
> +		return err;
> +	}
> +
> +	err = xdp_rxq_info_reg(&rxq->xdp_rxq, fep->netdev, rxq->id, 0);
> +	if (err < 0)
> +		goto err_free_pp;
> +
> +	err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL,
> +					 rxq->page_pool);
> +	if (err)
> +		goto err_unregister_rxq;
> +
> +	return 0;
> +
> +err_unregister_rxq:
> +	xdp_rxq_info_unreg(&rxq->xdp_rxq);
> +err_free_pp:
> +	page_pool_destroy(rxq->page_pool);
> +	rxq->page_pool = NULL;
> +	return err;
> +}
> +
>   static struct bufdesc *
>   fec_enet_txq_submit_frag_skb(struct fec_enet_priv_tx_q *txq,
>   			     struct sk_buff *skb,
> @@ -1285,7 +1335,6 @@ fec_stop(struct net_device *ndev)
>   	}
>   }
>   
> -
>   static void
>   fec_timeout(struct net_device *ndev, unsigned int txqueue)
>   {
> @@ -1450,7 +1499,7 @@ static void fec_enet_tx(struct net_device *ndev)
>   		fec_enet_tx_queue(ndev, i);
>   }
>   
> -static int
> +static int __maybe_unused
>   fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff *skb)
>   {
>   	struct  fec_enet_private *fep = netdev_priv(ndev);
> @@ -1470,8 +1519,9 @@ fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff
>   	return 0;
>   }
>   
> -static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
> -			       struct bufdesc *bdp, u32 length, bool swap)
> +static bool __maybe_unused
> +fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
> +		   struct bufdesc *bdp, u32 length, bool swap)
>   {
>   	struct  fec_enet_private *fep = netdev_priv(ndev);
>   	struct sk_buff *new_skb;
> @@ -1496,6 +1546,78 @@ static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
>   	return true;
>   }
>   
> +static void fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq,
> +				struct bufdesc *bdp, int index)
> +{
> +	struct page *new_page;
> +	dma_addr_t phys_addr;
> +
> +	new_page = page_pool_dev_alloc_pages(rxq->page_pool);
> +	WARN_ON(!new_page);
> +	rxq->rx_skb_info[index].page = new_page;
> +
> +	rxq->rx_skb_info[index].offset = FEC_ENET_XDP_HEADROOM;
> +	phys_addr = page_pool_get_dma_addr(new_page) + FEC_ENET_XDP_HEADROOM;
> +	bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
> +}
> +
> +static u32
> +fec_enet_run_xdp(struct fec_enet_private *fep, struct bpf_prog *prog,
> +		 struct xdp_buff *xdp, struct fec_enet_priv_rx_q *rxq, int index)
> +{
> +	unsigned int sync, len = xdp->data_end - xdp->data;
> +	u32 ret = FEC_ENET_XDP_PASS;
> +	struct page *page;
> +	int err;
> +	u32 act;
> +
> +	act = bpf_prog_run_xdp(prog, xdp);
> +
> +	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
> +	sync = xdp->data_end - xdp->data_hard_start - FEC_ENET_XDP_HEADROOM;
> +	sync = max(sync, len);
> +
> +	switch (act) {
> +	case XDP_PASS:
> +		rxq->stats.xdp_pass++;
> +		ret = FEC_ENET_XDP_PASS;
> +		break;
> +
> +	case XDP_TX:
> +		rxq->stats.xdp_tx++;
> +		bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
> +		fallthrough;

This fallthrough looks wrong. The next xdp_do_redirect() call will
pick up left-overs in the per-CPU bpf_redirect_info.
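
Until XDP_TX is actually implemented, a sketch of one way to restructure
this, grouping XDP_TX with the unsupported actions so that it is warned
about and dropped instead of falling into the redirect path:

        case XDP_TX:            /* not implemented yet: warn and drop */
        default:
                bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
                fallthrough;
        case XDP_ABORTED:
                fallthrough;    /* handle aborts by dropping packet */
        case XDP_DROP:
                rxq->stats.xdp_drop++;
                ret = FEC_ENET_XDP_CONSUMED;
                page = virt_to_head_page(xdp->data);
                page_pool_put_page(rxq->page_pool, page, sync, true);
                break;
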

> +
> +	case XDP_REDIRECT:
> +		err = xdp_do_redirect(fep->netdev, xdp, prog);
> +		rxq->stats.xdp_redirect++;
> +		if (!err) {
> +			ret = FEC_ENET_XDP_REDIR;
> +		} else {
> +			ret = FEC_ENET_XDP_CONSUMED;
> +			page = virt_to_head_page(xdp->data);
> +			page_pool_put_page(rxq->page_pool, page, sync, true);
> +		}
> +		break;
> +
> +	default:
> +		bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
> +		fallthrough;
> +
> +	case XDP_ABORTED:
> +		fallthrough;    /* handle aborts by dropping packet */
> +
> +	case XDP_DROP:
> +		rxq->stats.xdp_drop++;
> +		ret = FEC_ENET_XDP_CONSUMED;
> +		page = virt_to_head_page(xdp->data);
> +		page_pool_put_page(rxq->page_pool, page, sync, true);
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
>   /* During a receive, the bd_rx.cur points to the current incoming buffer.
>    * When we update through the ring, if the next incoming buffer has
>    * not been given to the system, we just set the empty indicator,
> @@ -1508,7 +1630,6 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
>   	struct fec_enet_priv_rx_q *rxq;
>   	struct bufdesc *bdp;
>   	unsigned short status;
> -	struct  sk_buff *skb_new = NULL;
>   	struct  sk_buff *skb;
>   	ushort	pkt_len;
>   	__u8 *data;
> @@ -1517,8 +1638,12 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
>   	bool	vlan_packet_rcvd = false;
>   	u16	vlan_tag;
>   	int	index = 0;
> -	bool	is_copybreak;
>   	bool	need_swap = fep->quirks & FEC_QUIRK_SWAP_FRAME;
> +	struct page *page;
> +	struct xdp_buff xdp;
> +	u32 ret, xdp_result = FEC_ENET_XDP_PASS;
> +
> +	struct bpf_prog *xdp_prog = READ_ONCE(fep->xdp_prog);
>   
>   #ifdef CONFIG_M532x
>   	flush_cache_all();
> @@ -1529,6 +1654,7 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
>   	 * These get messed up if we get called due to a busy condition.
>   	 */
>   	bdp = rxq->bd.cur;
> +	xdp_init_buff(&xdp, PAGE_SIZE, &rxq->xdp_rxq);
>   
>   	while (!((status = fec16_to_cpu(bdp->cbd_sc)) & BD_ENET_RX_EMPTY)) {
>   
> @@ -1570,31 +1696,37 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
>   		ndev->stats.rx_bytes += pkt_len;
>   
>   		index = fec_enet_get_bd_index(bdp, &rxq->bd);
> -		skb = rxq->rx_skbuff[index];
> +		page = rxq->rx_skb_info[index].page;
> +
> +		dma_sync_single_for_cpu(&fep->pdev->dev,
> +					fec32_to_cpu(bdp->cbd_bufaddr),
> +					pkt_len,
> +					DMA_FROM_DEVICE);
> +
> +		prefetch(page_address(page));
> +		fec_enet_update_cbd(rxq, bdp, index);
> +
> +		if (xdp_prog) {
> +			xdp_prepare_buff(&xdp, page_address(page),
> +					 FEC_ENET_XDP_HEADROOM, pkt_len, false);
> +
> +			ret = fec_enet_run_xdp(fep, xdp_prog, &xdp, rxq, index);
> +			xdp_result |= ret;
> +			if (ret != FEC_ENET_XDP_PASS)
> +				goto rx_processing_done;
> +		}
>   
>   		/* The packet length includes FCS, but we don't want to
>   		 * include that when passing upstream as it messes up
>   		 * bridging applications.
>   		 */
> -		is_copybreak = fec_enet_copybreak(ndev, &skb, bdp, pkt_len - 4,
> -						  need_swap);
> -		if (!is_copybreak) {
> -			skb_new = netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE);
> -			if (unlikely(!skb_new)) {
> -				ndev->stats.rx_dropped++;
> -				goto rx_processing_done;
> -			}
> -			dma_unmap_single(&fep->pdev->dev,
> -					 fec32_to_cpu(bdp->cbd_bufaddr),
> -					 FEC_ENET_RX_FRSIZE - fep->rx_align,
> -					 DMA_FROM_DEVICE);
> -		}
> -
> -		prefetch(skb->data - NET_IP_ALIGN);
> +		skb = build_skb(page_address(page), FEC_ENET_RX_FRSIZE);

This looks wrong, I think FEC_ENET_RX_FRSIZE should be replaced by 
PAGE_SIZE.
As build_skb() does:

  size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

> +		skb_reserve(skb, FEC_ENET_XDP_HEADROOM);

The skb_reserve looks correct.

>   		skb_put(skb, pkt_len - 4);
>   		data = skb->data;
> +		page_pool_release_page(rxq->page_pool, page);

Today page_pool has SKB recycle support (you might have looked at
drivers that didn't utilize this yet), thus you don't need to release
the page (page_pool_release_page) here.  Instead you could simply mark
the SKB for recycling, unless the driver does some page refcnt tricks I
didn't notice.

  skb_mark_for_recycle(skb);
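
Combined with the PAGE_SIZE fix discussed above, the SKB completion path
would then look roughly like this sketch:

        skb = build_skb(page_address(page), PAGE_SIZE);
        skb_reserve(skb, FEC_ENET_XDP_HEADROOM);
        skb_put(skb, pkt_len - 4);
        skb_mark_for_recycle(skb); /* page returns to the pool when the skb is freed */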


> -		if (!is_copybreak && need_swap)
> +		if (need_swap)
>   			swap_buffer(data, pkt_len);
>   
>   #if !defined(CONFIG_M5272)
> @@ -1649,16 +1781,6 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
[...]
Shenwei Wang Sept. 29, 2022, 12:40 p.m. UTC | #6
> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Wednesday, September 28, 2022 8:34 PM
> To: Shenwei Wang <shenwei.wang@nxp.com>
> Cc: Joakim Zhang <qiangqing.zhang@nxp.com>; David S. Miller
> <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
> Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Alexei
> Starovoitov <ast@kernel.org>; Daniel Borkmann <daniel@iogearbox.net>;
> Jesper Dangaard Brouer <hawk@kernel.org>; John Fastabend
> <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; imx@lists.linux.dev
> Subject: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
> 
> Caution: EXT Email
> 
> On Wed, Sep 28, 2022 at 10:25:09AM -0500, Shenwei Wang wrote:
> > This patch adds initial XDP support to the Freescale FEC driver. It
> > supports XDP_PASS, XDP_DROP and XDP_REDIRECT actions. Upcoming patches
> > will add support for XDP_TX and Zero Copy features.
> >
> > This patch also optimizes the RX buffers by using the page pool, which
> > uses one frame per page for easy management. In the future, it can be
> > further improved to use two frames per page.
> 
> Please could you split this patch up. It is rather large and hard to review. I think
> you can first add support for the page pool, and then add XDP support, for
> example.
> 

Will do it in the next version. Thanks for the review.

> It would be interesting to see how the page pool helps performance for normal
> traffic, since that is what most people use it for. And for a range of devices,
> since we need to make sure it does not cause any regressions for older devices.
> 

I actually did some comparison testing regarding the page pool for normal
traffic. So far I don't see significant improvement in the current
implementation. The performance for large packets improves a little, and the
performance for small packets gets a little worse.

Thanks,
Shenwei

>             Andrew
Shenwei Wang Sept. 29, 2022, 12:46 p.m. UTC | #7
> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Wednesday, September 28, 2022 8:51 PM
> To: Shenwei Wang <shenwei.wang@nxp.com>
> Cc: Joakim Zhang <qiangqing.zhang@nxp.com>; David S. Miller
> <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
> Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Alexei
> Starovoitov <ast@kernel.org>; Daniel Borkmann <daniel@iogearbox.net>;
> Jesper Dangaard Brouer <hawk@kernel.org>; John Fastabend
> <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; imx@lists.linux.dev
> Subject: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
> 
> Caution: EXT Email
> 
> > +struct fec_enet_xdp_stats {
> > +     u64     xdp_pass;
> > +     u64     xdp_drop;
> > +     u64     xdp_xmit;
> > +     u64     xdp_redirect;
> > +     u64     xdp_xmit_err;
> > +     u64     xdp_tx;
> > +     u64     xdp_tx_err;
> > +};
> > +
> > +     switch (act) {
> > +     case XDP_PASS:
> > +             rxq->stats.xdp_pass++;
> 
> Since the stats are u64, and most machines using the FEC are 32 bit, you cannot
> just do an increment. Take a look at u64_stats_sync.h.
> 

As this increment is only executed under the NAPI kthread context, is the protection still required?

> > -#define FEC_STATS_SIZE               (ARRAY_SIZE(fec_stats) * sizeof(u64))
> > +static struct fec_xdp_stat {
> > +     char name[ETH_GSTRING_LEN];
> > +     u32 count;
> > +} fec_xdp_stats[] = {
> > +     { "rx_xdp_redirect", 0 },
> > +     { "rx_xdp_pass", 0 },
> > +     { "rx_xdp_drop", 0 },
> > +     { "rx_xdp_tx", 0 },
> > +     { "rx_xdp_tx_errors", 0 },
> > +     { "tx_xdp_xmit", 0 },
> > +     { "tx_xdp_xmit_errors", 0 },
> > +};
> > +
> > +#define FEC_STATS_SIZE       ((ARRAY_SIZE(fec_stats) + \
> > +                     ARRAY_SIZE(fec_xdp_stats)) * sizeof(u64))
> 
> The page pool also has some stats. See page_pool_get_stats(),
> page_pool_ethtool_stats_get_strings() etc.
> 

Will add those stats in the next version.

Thanks,
Shenwei

>       Andrew
Shenwei Wang Sept. 29, 2022, 1:11 p.m. UTC | #8
> -----Original Message-----
> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> Sent: Thursday, September 29, 2022 5:17 AM
> To: Shenwei Wang <shenwei.wang@nxp.com>; Joakim Zhang
> <qiangqing.zhang@nxp.com>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo
> Abeni <pabeni@redhat.com>
> > diff --git a/drivers/net/ethernet/freescale/fec.h
> > b/drivers/net/ethernet/freescale/fec.h
> > index b0100fe3c9e4..f7531503aa95 100644
> > --- a/drivers/net/ethernet/freescale/fec.h
> > +++ b/drivers/net/ethernet/freescale/fec.h
> > @@ -346,8 +346,10 @@ struct bufdesc_ex {
> >    * the skbuffer directly.
> >    */
> >
> > +#define FEC_ENET_XDP_HEADROOM        (512) /* XDP_PACKET_HEADROOM
> + NET_IP_ALIGN) */
> 
> Why the large headroom?
> 

The accurate value here should be "XDP_PACKET_HEADROOM (256) +
NET_IP_ALIGN", rounded up to a 64-byte alignment. So 256 + 64 should be
enough here.
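
That is, a sketch of the tightened define (ALIGN() rounds
XDP_PACKET_HEADROOM + NET_IP_ALIGN up to the next 64-byte boundary,
giving 320 when NET_IP_ALIGN is 2):

        #define FEC_ENET_XDP_HEADROOM	ALIGN(XDP_PACKET_HEADROOM + NET_IP_ALIGN, 64)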
 
> > +
> >   #define FEC_ENET_RX_PAGES   256
> > -#define FEC_ENET_RX_FRSIZE   2048
> > +#define FEC_ENET_RX_FRSIZE   (PAGE_SIZE - FEC_ENET_XDP_HEADROOM)
> 
> This FEC_ENET_RX_FRSIZE is likely wrong, because you also need to reserve 320
> bytes at the end for struct skb_shared_info.
> (320 calculated as SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
> 
> >   #define FEC_ENET_RX_FRPPG   (PAGE_SIZE / FEC_ENET_RX_FRSIZE)
> >   #define RX_RING_SIZE                (FEC_ENET_RX_FRPPG *
> FEC_ENET_RX_PAGES)
> >   #define FEC_ENET_TX_FRSIZE  2048
> > @@ -517,6 +519,22 @@ struct bufdesc_prop {
> [...]
> 
> > diff --git a/drivers/net/ethernet/freescale/fec_main.c
> > b/drivers/net/ethernet/freescale/fec_main.c
> > index 59921218a8a4..2e30182ed770 100644
> > --- a/drivers/net/ethernet/freescale/fec_main.c
> > +++ b/drivers/net/ethernet/freescale/fec_main.c
> > @@ -66,6 +66,8 @@
> >   #include <linux/mfd/syscon.h>
> >   #include <linux/regmap.h>
> >   #include <soc/imx/cpuidle.h>
> > +#include <linux/filter.h>
> > +#include <linux/bpf.h>
> >
> >   #include <asm/cacheflush.h>
> >
> > @@ -87,6 +89,11 @@ static const u16 fec_enet_vlan_pri_to_queue[8] = {0, 0,
> 1, 1, 1, 2, 2, 2};
> >   #define FEC_ENET_OPD_V      0xFFF0
> >   #define FEC_MDIO_PM_TIMEOUT  100 /* ms */
> >
> > +#define FEC_ENET_XDP_PASS          0
> > +#define FEC_ENET_XDP_CONSUMED      BIT(0)
> > +#define FEC_ENET_XDP_TX            BIT(1)
> > +#define FEC_ENET_XDP_REDIR         BIT(2)
> > +
> >   struct fec_devinfo {
> >       u32 quirks;
> >   };
> > @@ -422,6 +429,49 @@ fec_enet_clear_csum(struct sk_buff *skb, struct
> net_device *ndev)
> >       return 0;
> >   }
> >
> > +static int
> > +fec_enet_create_page_pool(struct fec_enet_private *fep,
> > +                       struct fec_enet_priv_rx_q *rxq, int size) {
> > +     struct bpf_prog *xdp_prog = READ_ONCE(fep->xdp_prog);
> > +     struct page_pool_params pp_params = {
> > +             .order = 0,
> > +             .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
> > +             .pool_size = size,
> > +             .nid = dev_to_node(&fep->pdev->dev),
> > +             .dev = &fep->pdev->dev,
> > +             .dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE,
> > +             .offset = FEC_ENET_XDP_HEADROOM,
> > +             .max_len = FEC_ENET_RX_FRSIZE,
> 
> XDP BPF-prog cannot access last 320 bytes, so FEC_ENET_RX_FRSIZE is wrong
> here.
> 

So the FEC_ENET_RX_FRSIZE should subtract the sizeof(struct skb_shared_info) in the definition, right? 

> > +     };
> > +     int err;
> > +
> > +     rxq->page_pool = page_pool_create(&pp_params);
> > +     if (IS_ERR(rxq->page_pool)) {
> > +             err = PTR_ERR(rxq->page_pool);
> > +             rxq->page_pool = NULL;
> > +             return err;
> > +     }
> > +
> > +     err = xdp_rxq_info_reg(&rxq->xdp_rxq, fep->netdev, rxq->id, 0);
> > +     if (err < 0)
> > +             goto err_free_pp;
> > +
> > +     err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq,
> MEM_TYPE_PAGE_POOL,
> > +                                      rxq->page_pool);
> > +     if (err)
> > +             goto err_unregister_rxq;
> > +
> > +     return 0;
> > +
> > +err_unregister_rxq:
> > +     xdp_rxq_info_unreg(&rxq->xdp_rxq);
> > +err_free_pp:
> > +     page_pool_destroy(rxq->page_pool);
> > +     rxq->page_pool = NULL;
> > +     return err;
> > +}
> > +
> >   static struct bufdesc *
> >   fec_enet_txq_submit_frag_skb(struct fec_enet_priv_tx_q *txq,
> >                            struct sk_buff *skb, @@ -1285,7 +1335,6 @@
> > fec_stop(struct net_device *ndev)
> >       }
> >   }
> >
> > -
> >   static void
> >   fec_timeout(struct net_device *ndev, unsigned int txqueue)
> >   {
> > @@ -1450,7 +1499,7 @@ static void fec_enet_tx(struct net_device *ndev)
> >               fec_enet_tx_queue(ndev, i);
> >   }
> >
> > -static int
> > +static int __maybe_unused
> >   fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct
> sk_buff *skb)
> >   {
> >       struct  fec_enet_private *fep = netdev_priv(ndev); @@ -1470,8
> > +1519,9 @@ fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc
> *bdp, struct sk_buff
> >       return 0;
> >   }
> >
> > -static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
> > -                            struct bufdesc *bdp, u32 length, bool swap)
> > +static bool __maybe_unused
> > +fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
> > +                struct bufdesc *bdp, u32 length, bool swap)
> >   {
> >       struct  fec_enet_private *fep = netdev_priv(ndev);
> >       struct sk_buff *new_skb;
> > @@ -1496,6 +1546,78 @@ static bool fec_enet_copybreak(struct net_device
> *ndev, struct sk_buff **skb,
> >       return true;
> >   }
> >
> > +static void fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq,
> > +                             struct bufdesc *bdp, int index) {
> > +     struct page *new_page;
> > +     dma_addr_t phys_addr;
> > +
> > +     new_page = page_pool_dev_alloc_pages(rxq->page_pool);
> > +     WARN_ON(!new_page);
> > +     rxq->rx_skb_info[index].page = new_page;
> > +
> > +     rxq->rx_skb_info[index].offset = FEC_ENET_XDP_HEADROOM;
> > +     phys_addr = page_pool_get_dma_addr(new_page) +
> FEC_ENET_XDP_HEADROOM;
> > +     bdp->cbd_bufaddr = cpu_to_fec32(phys_addr); }
> > +
> > +static u32
> > +fec_enet_run_xdp(struct fec_enet_private *fep, struct bpf_prog *prog,
> > +              struct xdp_buff *xdp, struct fec_enet_priv_rx_q *rxq,
> > +int index) {
> > +     unsigned int sync, len = xdp->data_end - xdp->data;
> > +     u32 ret = FEC_ENET_XDP_PASS;
> > +     struct page *page;
> > +     int err;
> > +     u32 act;
> > +
> > +     act = bpf_prog_run_xdp(prog, xdp);
> > +
> > +     /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
> > +     sync = xdp->data_end - xdp->data_hard_start -
> FEC_ENET_XDP_HEADROOM;
> > +     sync = max(sync, len);
> > +
> > +     switch (act) {
> > +     case XDP_PASS:
> > +             rxq->stats.xdp_pass++;
> > +             ret = FEC_ENET_XDP_PASS;
> > +             break;
> > +
> > +     case XDP_TX:
> > +             rxq->stats.xdp_tx++;
> > +             bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
> > +             fallthrough;
> 
> This fallthrough looks wrong. The next xdp_do_redirect() call will pick up
> left-overs in the per-CPU bpf_redirect_info.
> 

So before XDP_TX is implemented, this part of the code should reside below the XDP_REDIRECT case?

> > +
> > +     case XDP_REDIRECT:
> > +             err = xdp_do_redirect(fep->netdev, xdp, prog);
> > +             rxq->stats.xdp_redirect++;
> > -                     dma_unmap_single(&fep->pdev->dev,
...

> > -                                      fec32_to_cpu(bdp->cbd_bufaddr),
> > -                                      FEC_ENET_RX_FRSIZE - fep->rx_align,
> > -                                      DMA_FROM_DEVICE);
> > -             }
> > -
> > -             prefetch(skb->data - NET_IP_ALIGN);
> > +             skb = build_skb(page_address(page), FEC_ENET_RX_FRSIZE);
> 
> This looks wrong, I think FEC_ENET_RX_FRSIZE should be replaced by PAGE_SIZE.
> As build_skb() does:
> 
>   size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> 

Agree. As the current FRSIZE definition did not subtract the sizeof(struct skb_shared_info), I happened to not see the problem during the testing.

Thanks,
Shenwei

> > +             skb_reserve(skb, FEC_ENET_XDP_HEADROOM);
> 
> The skb_reserve looks correct.
> 
> >               skb_put(skb, pkt_len - 4);
> >               data = skb->data;
> > +             page_pool_release_page(rxq->page_pool, page);
> 
> Today page_pool has SKB recycle support (you might have looked at drivers
> that didn't utilize this yet), thus you don't need to release the page
> (page_pool_release_page) here.  Instead you could simply mark the SKB for
> recycling, unless the driver does some page refcnt tricks I didn't notice.
> 
>   skb_mark_for_recycle(skb);
> 
> 
> > -             if (!is_copybreak && need_swap)
> > +             if (need_swap)
> >                       swap_buffer(data, pkt_len);
> >
> >   #if !defined(CONFIG_M5272)
> > @@ -1649,16 +1781,6 @@ fec_enet_rx_queue(struct net_device *ndev, int
> > budget, u16 queue_id)
> [...]
Andrew Lunn Sept. 29, 2022, 1:22 p.m. UTC | #9
> I actually did some comparison testing regarding the page pool for
> normal traffic.  So far I don't see significant improvement in the
> current implementation. The performance for large packets improves a
> little, and the performance for small packets gets a little worse.

What hardware was this for? imx51? imx6? imx7 Vybrid? These all use
the FEC.

By small packets, do you mean those under the copybreak limit?

Please provide some benchmark numbers with your next patchset.

       Andrew
Andrew Lunn Sept. 29, 2022, 1:24 p.m. UTC | #10
> > > +struct fec_enet_xdp_stats {
> > > +     u64     xdp_pass;
> > > +     u64     xdp_drop;
> > > +     u64     xdp_xmit;
> > > +     u64     xdp_redirect;
> > > +     u64     xdp_xmit_err;
> > > +     u64     xdp_tx;
> > > +     u64     xdp_tx_err;
> > > +};
> > > +
> > > +     switch (act) {
> > > +     case XDP_PASS:
> > > +             rxq->stats.xdp_pass++;
> > 
> > Since the stats are u64, and most machines using the FEC are 32 bit, you cannot
> > just do an increment. Take a look at u64_stats_sync.h.
> > 
> 

> As this increment is only executed under the NAPI kthread context,
> is the protection still required?

Are the statistics values read by ethtool under NAPI kthread context?

    Andrew
Shenwei Wang Sept. 29, 2022, 1:26 p.m. UTC | #11
> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Thursday, September 29, 2022 8:23 AM
> To: Shenwei Wang <shenwei.wang@nxp.com>
> Cc: Joakim Zhang <qiangqing.zhang@nxp.com>; David S. Miller
> <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
> Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Alexei
> Starovoitov <ast@kernel.org>; Daniel Borkmann <daniel@iogearbox.net>;
> Jesper Dangaard Brouer <hawk@kernel.org>; John Fastabend
> <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; imx@lists.linux.dev
> Subject: Re: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
> 
> Caution: EXT Email
> 
> > I actually did some compare testing regarding the page pool for normal
> > traffic.  So far I don't see significant improvement in the current
> > implementation. The performance for large packets improves a little,
> > and the performance for small packets get a little worse.
> 
> What hardware was this for? imx51? imx6? imx7 Vybrid? These all use the FEC.

I tested on imx8qxp platform. It is ARM64.
 
> 
> By small packets, do you mean those under the copybreak limit?
> 
> Please provide some benchmark numbers with your next patchset.

Yes, the packet size is 64 bytes and it is under the copybreak limit. As the impact is not significant, I would prefer to remove the copybreak logic.

Thanks,
Shenwei

> 
>        Andrew
Shenwei Wang Sept. 29, 2022, 1:35 p.m. UTC | #12
> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Thursday, September 29, 2022 8:24 AM
> To: Shenwei Wang <shenwei.wang@nxp.com>
> Cc: Joakim Zhang <qiangqing.zhang@nxp.com>; David S. Miller
> <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
> Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Alexei
> Starovoitov <ast@kernel.org>; Daniel Borkmann <daniel@iogearbox.net>;
> Jesper Dangaard Brouer <hawk@kernel.org>; John Fastabend
> <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; imx@lists.linux.dev
> Subject: Re: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
> 
> Caution: EXT Email
> 
> > > > +struct fec_enet_xdp_stats {
> > > > +     u64     xdp_pass;
> > > > +     u64     xdp_drop;
> > > > +     u64     xdp_xmit;
> > > > +     u64     xdp_redirect;
> > > > +     u64     xdp_xmit_err;
> > > > +     u64     xdp_tx;
> > > > +     u64     xdp_tx_err;
> > > > +};
> > > > +
> > > > +     switch (act) {
> > > > +     case XDP_PASS:
> > > > +             rxq->stats.xdp_pass++;
> > >
> > > Since the stats are u64, and most machines using the FEC are 32 bit,
> > > you cannot just do an increment. Take a look at u64_stats_sync.h.
> > >
> >
> 
> > As this increment is only executed under the NAPI kthread context, is
> > the protection still required?
> 
> Are the statistics values read by ethtool under NAPI kthread context?
> 

You are right. The read is not under NAPI context.

Thanks,
Shenwei

>     Andrew
Andrew Lunn Sept. 29, 2022, 3:19 p.m. UTC | #13
> > > I actually did some compare testing regarding the page pool for normal
> > > traffic.  So far I don't see significant improvement in the current
> > > implementation. The performance for large packets improves a little,
> > > and the performance for small packets get a little worse.
> > 
> > What hardware was this for? imx51? imx6? imx7 Vybrid? These all use the FEC.
> 
> I tested on imx8qxp platform. It is ARM64.

Please also test the older platforms. imx8 is quite powerful, so some
performance loss does not matter too much. But for the older systems,
I know people have spent a lot of time and effort optimising network
performance, and they will be unhappy if you make it slower.

> > By small packets, do you mean those under the copybreak limit?
> > 
> > Please provide some benchmark numbers with your next patchset.
> 

> Yes, the packet size is 64 bytes and it is under the copybreak
> limit. As the impact is not significant, I would prefer to remove
> the copybreak logic.

Let's look at the benchmark numbers, what you actually mean by not
significant, and across a range of hardware.

	     Andrew
Jesper Dangaard Brouer Sept. 29, 2022, 3:28 p.m. UTC | #14
On 29/09/2022 15.26, Shenwei Wang wrote:
> 
>> From: Andrew Lunn <andrew@lunn.ch>
>> Sent: Thursday, September 29, 2022 8:23 AM
[...]
>>
>>> I actually did some comparison testing regarding the page pool for normal
>>> traffic.  So far I don't see significant improvement in the current
>>> implementation. The performance for large packets improves a little,
>>> and the performance for small packets gets a little worse.
>>
>> What hardware was this for? imx51? imx6? imx7 Vybrid? These all use the FEC.
> 
> I tested on imx8qxp platform. It is ARM64.

On mvneta driver/platform we saw huge speedup replacing:

   page_pool_release_page(rxq->page_pool, page);
with
   skb_mark_for_recycle(skb);

As I mentioned: Today page_pool has SKB recycle support (you might have 
looked at drivers that didn't utilize this yet), thus you don't need to 
release the page (page_pool_release_page) here.  Instead you could 
simply mark the SKB for recycling, unless the driver does some page 
refcnt tricks I didn't notice.

On the mvneta driver/platform the DMA unmap (in page_pool_release_page) 
was very expensive. This imx8qxp platform might have faster DMA unmap in 
case it is cache-coherent.

I would be very interested in knowing if skb_mark_for_recycle() helps on 
this platform, for normal network stack performance.

>> By small packets, do you mean those under the copybreak limit?
>>
>> Please provide some benchmark numbers with your next patchset.
> 
> Yes, the packet size is 64 bytes and it is under the copybreak limit.
> As the impact is not significant, I would prefer to remove the
> copybreak  logic.

+1 to removing this logic if possible, due to maintenance cost.

--Jesper
Andrew Lunn Sept. 29, 2022, 3:39 p.m. UTC | #15
On Thu, Sep 29, 2022 at 05:28:43PM +0200, Jesper Dangaard Brouer wrote:
> 
> 
> On 29/09/2022 15.26, Shenwei Wang wrote:
> > 
> > > From: Andrew Lunn <andrew@lunn.ch>
> > > Sent: Thursday, September 29, 2022 8:23 AM
> [...]
> > > 
> > > > I actually did some comparison testing regarding the page pool for normal
> > > > traffic.  So far I don't see significant improvement in the current
> > > > implementation. The performance for large packets improves a little,
> > > > and the performance for small packets gets a little worse.
> > > 
> > > What hardware was this for? imx51? imx6? imx7 Vybrid? These all use the FEC.
> > 
> > I tested on imx8qxp platform. It is ARM64.
> 
> On mvneta driver/platform we saw huge speedup replacing:
> 
>   page_pool_release_page(rxq->page_pool, page);
> with
>   skb_mark_for_recycle(skb);
> 
> As I mentioned: Today page_pool has SKB recycle support (you might have
> looked at drivers that didn't utilize this yet), thus you don't need to
> release the page (page_pool_release_page) here.  Instead you could simply
> mark the SKB for recycling, unless the driver does some page refcnt tricks I
> didn't notice.
> 
> On the mvneta driver/platform the DMA unmap (in page_pool_release_page) was
> very expensive. This imx8qxp platform might have faster DMA unmap in case
> it is cache-coherent.

I don't know about imx8qxp, but i've played with imx6 and Vybrid, and
cache flush and invalidate are very expensive.

      Andrew
Jesper Dangaard Brouer Sept. 29, 2022, 3:44 p.m. UTC | #16
On 29/09/2022 15.11, Shenwei Wang wrote:
> 
>> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
 >>
>>> diff --git a/drivers/net/ethernet/freescale/fec.h
>>> b/drivers/net/ethernet/freescale/fec.h
>>> index b0100fe3c9e4..f7531503aa95 100644
>>> --- a/drivers/net/ethernet/freescale/fec.h
>>> +++ b/drivers/net/ethernet/freescale/fec.h
>>> @@ -346,8 +346,10 @@ struct bufdesc_ex {
>>>     * the skbuffer directly.
>>>     */
>>>
>>> +#define FEC_ENET_XDP_HEADROOM        (512) /* XDP_PACKET_HEADROOM
>> + NET_IP_ALIGN) */
>>
>> Why the large headroom?
>>
> 
> The accurate value here should be "XDP_PACKET_HEADROOM (256) +
> NET_IP_ALIGN", rounded up to a 64-byte alignment. So 256 + 64 should be
> enough here.
> 

Most other XDP drivers have 256 bytes of headroom.
I don't understand why you just don't keep this at 256, like other drivers?

>>> +
>>>    #define FEC_ENET_RX_PAGES   256
>>> -#define FEC_ENET_RX_FRSIZE   2048
>>> +#define FEC_ENET_RX_FRSIZE   (PAGE_SIZE - FEC_ENET_XDP_HEADROOM)
>>
>> This FEC_ENET_RX_FRSIZE is likely wrong, because you also need to reserve 320
>> bytes at the end for struct skb_shared_info.
>> (320 calculated as SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
>>
>>>    #define FEC_ENET_RX_FRPPG   (PAGE_SIZE / FEC_ENET_RX_FRSIZE)
>>>    #define RX_RING_SIZE                (FEC_ENET_RX_FRPPG *
>> FEC_ENET_RX_PAGES)
>>>    #define FEC_ENET_TX_FRSIZE  2048
>>> @@ -517,6 +519,22 @@ struct bufdesc_prop {
>> [...]
>>
>>> diff --git a/drivers/net/ethernet/freescale/fec_main.c
>>> b/drivers/net/ethernet/freescale/fec_main.c
>>> index 59921218a8a4..2e30182ed770 100644
>>> --- a/drivers/net/ethernet/freescale/fec_main.c
>>> +++ b/drivers/net/ethernet/freescale/fec_main.c
>>> @@ -66,6 +66,8 @@
>>>    #include <linux/mfd/syscon.h>
>>>    #include <linux/regmap.h>
>>>    #include <soc/imx/cpuidle.h>
>>> +#include <linux/filter.h>
>>> +#include <linux/bpf.h>
>>>
>>>    #include <asm/cacheflush.h>
>>>
>>> @@ -87,6 +89,11 @@ static const u16 fec_enet_vlan_pri_to_queue[8] = {0, 0,
>> 1, 1, 1, 2, 2, 2};
>>>    #define FEC_ENET_OPD_V      0xFFF0
>>>    #define FEC_MDIO_PM_TIMEOUT  100 /* ms */
>>>
>>> +#define FEC_ENET_XDP_PASS          0
>>> +#define FEC_ENET_XDP_CONSUMED      BIT(0)
>>> +#define FEC_ENET_XDP_TX            BIT(1)
>>> +#define FEC_ENET_XDP_REDIR         BIT(2)
>>> +
>>>    struct fec_devinfo {
>>>        u32 quirks;
>>>    };
>>> @@ -422,6 +429,49 @@ fec_enet_clear_csum(struct sk_buff *skb, struct
>> net_device *ndev)
>>>        return 0;
>>>    }
>>>
>>> +static int
>>> +fec_enet_create_page_pool(struct fec_enet_private *fep,
>>> +                       struct fec_enet_priv_rx_q *rxq, int size) {
>>> +     struct bpf_prog *xdp_prog = READ_ONCE(fep->xdp_prog);
>>> +     struct page_pool_params pp_params = {
>>> +             .order = 0,
>>> +             .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
>>> +             .pool_size = size,
>>> +             .nid = dev_to_node(&fep->pdev->dev),
>>> +             .dev = &fep->pdev->dev,
>>> +             .dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE,
>>> +             .offset = FEC_ENET_XDP_HEADROOM,
>>> +             .max_len = FEC_ENET_RX_FRSIZE,
>>
>> XDP BPF-prog cannot access last 320 bytes, so FEC_ENET_RX_FRSIZE is wrong
>> here.
>>
> 
> So the FEC_ENET_RX_FRSIZE should subtract the sizeof(struct
> skb_shared_info) in the definition, right?
> 

Yes correct, but use:
   SKB_DATA_ALIGN(sizeof(struct skb_shared_info))


>>> +     };
>>> +     int err;
>>> +
>>> +     rxq->page_pool = page_pool_create(&pp_params);
>>> +     if (IS_ERR(rxq->page_pool)) {
>>> +             err = PTR_ERR(rxq->page_pool);
>>> +             rxq->page_pool = NULL;
>>> +             return err;
>>> +     }
>>> +
>>> +     err = xdp_rxq_info_reg(&rxq->xdp_rxq, fep->netdev, rxq->id, 0);
>>> +     if (err < 0)
>>> +             goto err_free_pp;
>>> +
>>> +     err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL,
>>> +                                      rxq->page_pool);
>>> +     if (err)
>>> +             goto err_unregister_rxq;
>>> +
>>> +     return 0;
>>> +
>>> +err_unregister_rxq:
>>> +     xdp_rxq_info_unreg(&rxq->xdp_rxq);
>>> +err_free_pp:
>>> +     page_pool_destroy(rxq->page_pool);
>>> +     rxq->page_pool = NULL;
>>> +     return err;
>>> +}
>>> +
>>>    static struct bufdesc *
>>>    fec_enet_txq_submit_frag_skb(struct fec_enet_priv_tx_q *txq,
>>>                             struct sk_buff *skb, @@ -1285,7 +1335,6 @@
>>> fec_stop(struct net_device *ndev)
>>>        }
>>>    }
>>>
>>> -
>>>    static void
>>>    fec_timeout(struct net_device *ndev, unsigned int txqueue)
>>>    {
>>> @@ -1450,7 +1499,7 @@ static void fec_enet_tx(struct net_device *ndev)
>>>                fec_enet_tx_queue(ndev, i);
>>>    }
>>>
>>> -static int
>>> +static int __maybe_unused
>>>    fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff *skb)
>>>    {
>>>        struct  fec_enet_private *fep = netdev_priv(ndev); @@ -1470,8
>>> +1519,9 @@ fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc
>> *bdp, struct sk_buff
>>>        return 0;
>>>    }
>>>
>>> -static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
>>> -                            struct bufdesc *bdp, u32 length, bool swap)
>>> +static bool __maybe_unused
>>> +fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
>>> +                struct bufdesc *bdp, u32 length, bool swap)
>>>    {
>>>        struct  fec_enet_private *fep = netdev_priv(ndev);
>>>        struct sk_buff *new_skb;
>>> @@ -1496,6 +1546,78 @@ static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
>>>        return true;
>>>    }
>>>
>>> +static void fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq,
>>> +                             struct bufdesc *bdp, int index) {
>>> +     struct page *new_page;
>>> +     dma_addr_t phys_addr;
>>> +
>>> +     new_page = page_pool_dev_alloc_pages(rxq->page_pool);
>>> +     WARN_ON(!new_page);
>>> +     rxq->rx_skb_info[index].page = new_page;
>>> +
>>> +     rxq->rx_skb_info[index].offset = FEC_ENET_XDP_HEADROOM;
>>> +     phys_addr = page_pool_get_dma_addr(new_page) + FEC_ENET_XDP_HEADROOM;
>>> +     bdp->cbd_bufaddr = cpu_to_fec32(phys_addr); }
>>> +
>>> +static u32
>>> +fec_enet_run_xdp(struct fec_enet_private *fep, struct bpf_prog *prog,
>>> +              struct xdp_buff *xdp, struct fec_enet_priv_rx_q *rxq,
>>> +int index) {
>>> +     unsigned int sync, len = xdp->data_end - xdp->data;
>>> +     u32 ret = FEC_ENET_XDP_PASS;
>>> +     struct page *page;
>>> +     int err;
>>> +     u32 act;
>>> +
>>> +     act = bpf_prog_run_xdp(prog, xdp);
>>> +
>>> +     /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
>>> +     sync = xdp->data_end - xdp->data_hard_start - FEC_ENET_XDP_HEADROOM;
>>> +     sync = max(sync, len);
>>> +
>>> +     switch (act) {
>>> +     case XDP_PASS:
>>> +             rxq->stats.xdp_pass++;
>>> +             ret = FEC_ENET_XDP_PASS;
>>> +             break;
>>> +
>>> +     case XDP_TX:
>>> +             rxq->stats.xdp_tx++;
>>> +             bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
>>> +             fallthrough;
>>
>> This fallthrough looks wrong. The next xdp_do_redirect() call will pick up
>> left-overs in the per-CPU bpf_redirect_info.
>>
> 
> So before XDP_TX is implemented, this part of the code should reside below the XDP_REDIRECT case?
> 

If that fallthrough goes to dropping packet, then yes.

>>> +
>>> +     case XDP_REDIRECT:
>>> +             err = xdp_do_redirect(fep->netdev, xdp, prog);
>>> +             rxq->stats.xdp_redirect++;
>>> -                     dma_unmap_single(&fep->pdev->dev,
> ...
> 
>>> -                                      fec32_to_cpu(bdp->cbd_bufaddr),
>>> -                                      FEC_ENET_RX_FRSIZE - fep->rx_align,
>>> -                                      DMA_FROM_DEVICE);
>>> -             }
>>> -
>>> -             prefetch(skb->data - NET_IP_ALIGN);
>>> +             skb = build_skb(page_address(page), FEC_ENET_RX_FRSIZE);
>>
>> This looks wrong, I think FEC_ENET_RX_FRSIZE should be replaced by PAGE_SIZE.
>> As build_skb() does:
>>
>>    size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>>
> 
> Agree. As the current FRSIZE definition did not subtract the sizeof(struct skb_shared_info), I happened to not see the problem during the testing.
> 

As I wrote use PAGE_SIZE here.


>>> +             skb_reserve(skb, FEC_ENET_XDP_HEADROOM);
>>
>> The skb_reserve looks correct.
>>
>>>                skb_put(skb, pkt_len - 4);
>>>                data = skb->data;
>>> +             page_pool_release_page(rxq->page_pool, page);
>>
>> Today page_pool has SKB recycle support (you might have looked at drivers
>> that didn't utilize this yet), thus you don't need to release the page
>> (page_pool_release_page) here.  Instead you could simply mark the SKB for
>> recycling, unless the driver does some page refcnt tricks I didn't notice.
>>
>>    skb_mark_for_recycle(skb);

I hope you try out the above proposed change.

>>
>>> -             if (!is_copybreak && need_swap)
>>> +             if (need_swap)
>>>                        swap_buffer(data, pkt_len);
>>>
>>>    #if !defined(CONFIG_M5272)
>>> @@ -1649,16 +1781,6 @@ fec_enet_rx_queue(struct net_device *ndev, int
>>> budget, u16 queue_id)
>> [...]
>
Shenwei Wang Sept. 29, 2022, 3:52 p.m. UTC | #17
> -----Original Message-----
> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> Sent: Thursday, September 29, 2022 10:29 AM
> To: Shenwei Wang <shenwei.wang@nxp.com>; Andrew Lunn <andrew@lunn.ch>
> Cc: brouer@redhat.com; Joakim Zhang <qiangqing.zhang@nxp.com>; David S.
> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
> Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Alexei
> Starovoitov <ast@kernel.org>; Daniel Borkmann <daniel@iogearbox.net>;
> Jesper Dangaard Brouer <hawk@kernel.org>; John Fastabend
> <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; imx@lists.linux.dev
> Subject: Re: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
> 
> Caution: EXT Email
> 
> On 29/09/2022 15.26, Shenwei Wang wrote:
> >
> >> From: Andrew Lunn <andrew@lunn.ch>
> >> Sent: Thursday, September 29, 2022 8:23 AM
> [...]
> >>
> >>> I actually did some comparison testing regarding the page pool for
> >>> normal traffic.  So far I don't see significant improvement in the
> >>> current implementation. The performance for large packets improves a
> >>> little, and the performance for small packets gets a little worse.
> >>
> >> What hardware was this for? imx51? imx6? imx7 Vybrid? These all use the
> FEC.
> >
> > I tested on imx8qxp platform. It is ARM64.
> 
> On mvneta driver/platform we saw huge speedup replacing:
> 
>    page_pool_release_page(rxq->page_pool, page); with
>    skb_mark_for_recycle(skb);
> 
> As I mentioned: Today page_pool has SKB recycle support (you might have
> looked at drivers that didn't utilize this yet), thus you don't need to release the
> page (page_pool_release_page) here.  Instead you could simply mark the SKB
> for recycling, unless the driver does some page refcnt tricks I didn't notice.
> 
> On the mvneta driver/platform the DMA unmap (in page_pool_release_page)
> was very expensive. This imx8qxp platform might have faster DMA unmap in
> case it is cache-coherent.
> 
> I would be very interested in knowing if skb_mark_for_recycle() helps on this
> platform, for normal network stack performance.
> 

Did some quick comparison testing for the following 3 scenarios:
1. original implementation 

shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
------------------------------------------------------------
Client connecting to 10.81.16.245, TCP port 5001
TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
------------------------------------------------------------
[  1] local 10.81.17.20 port 49154 connected with 10.81.16.245 port 5001
[ ID] Interval       Transfer     Bandwidth
[  1] 0.0000-1.0000 sec   104 MBytes   868 Mbits/sec
[  1] 1.0000-2.0000 sec   105 MBytes   878 Mbits/sec
[  1] 2.0000-3.0000 sec   105 MBytes   881 Mbits/sec
[  1] 3.0000-4.0000 sec   105 MBytes   879 Mbits/sec
[  1] 4.0000-5.0000 sec   105 MBytes   878 Mbits/sec
[  1] 5.0000-6.0000 sec   105 MBytes   878 Mbits/sec
[  1] 6.0000-7.0000 sec   104 MBytes   875 Mbits/sec
[  1] 7.0000-8.0000 sec   104 MBytes   875 Mbits/sec
[  1] 8.0000-9.0000 sec   104 MBytes   873 Mbits/sec
[  1] 9.0000-10.0000 sec   104 MBytes   875 Mbits/sec
[  1] 0.0000-10.0073 sec  1.02 GBytes   875 Mbits/sec

2. Page pool with page_pool_release_page

shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
------------------------------------------------------------
Client connecting to 10.81.16.245, TCP port 5001
TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
------------------------------------------------------------
[  1] local 10.81.17.20 port 35924 connected with 10.81.16.245 port 5001
[ ID] Interval       Transfer     Bandwidth
[  1] 0.0000-1.0000 sec   101 MBytes   849 Mbits/sec
[  1] 1.0000-2.0000 sec   102 MBytes   860 Mbits/sec
[  1] 2.0000-3.0000 sec   102 MBytes   860 Mbits/sec
[  1] 3.0000-4.0000 sec   102 MBytes   859 Mbits/sec
[  1] 4.0000-5.0000 sec   103 MBytes   863 Mbits/sec
[  1] 5.0000-6.0000 sec   103 MBytes   864 Mbits/sec
[  1] 6.0000-7.0000 sec   103 MBytes   863 Mbits/sec
[  1] 7.0000-8.0000 sec   103 MBytes   865 Mbits/sec
[  1] 8.0000-9.0000 sec   103 MBytes   862 Mbits/sec
[  1] 9.0000-10.0000 sec   102 MBytes   856 Mbits/sec
[  1] 0.0000-10.0246 sec  1.00 GBytes   858 Mbits/sec


3. page pool with skb_mark_for_recycle

shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
------------------------------------------------------------
Client connecting to 10.81.16.245, TCP port 5001
TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
------------------------------------------------------------
[  1] local 10.81.17.20 port 42724 connected with 10.81.16.245 port 5001
[ ID] Interval       Transfer     Bandwidth
[  1] 0.0000-1.0000 sec   111 MBytes   931 Mbits/sec
[  1] 1.0000-2.0000 sec   112 MBytes   935 Mbits/sec
[  1] 2.0000-3.0000 sec   111 MBytes   934 Mbits/sec
[  1] 3.0000-4.0000 sec   111 MBytes   934 Mbits/sec
[  1] 4.0000-5.0000 sec   111 MBytes   934 Mbits/sec
[  1] 5.0000-6.0000 sec   112 MBytes   935 Mbits/sec
[  1] 6.0000-7.0000 sec   111 MBytes   934 Mbits/sec
[  1] 7.0000-8.0000 sec   111 MBytes   933 Mbits/sec
[  1] 8.0000-9.0000 sec   112 MBytes   935 Mbits/sec
[  1] 9.0000-10.0000 sec   111 MBytes   933 Mbits/sec
[  1] 0.0000-10.0069 sec  1.09 GBytes   934 Mbits/sec

For small packet size (64 bytes), all three cases have almost the same result:

shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1 -l 64
------------------------------------------------------------
Client connecting to 10.81.16.245, TCP port 5001
TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
------------------------------------------------------------
[  1] local 10.81.17.20 port 58204 connected with 10.81.16.245 port 5001
[ ID] Interval       Transfer     Bandwidth
[  1] 0.0000-1.0000 sec  36.9 MBytes   309 Mbits/sec
[  1] 1.0000-2.0000 sec  36.6 MBytes   307 Mbits/sec
[  1] 2.0000-3.0000 sec  36.6 MBytes   307 Mbits/sec
[  1] 3.0000-4.0000 sec  36.5 MBytes   307 Mbits/sec
[  1] 4.0000-5.0000 sec  37.1 MBytes   311 Mbits/sec
[  1] 5.0000-6.0000 sec  37.2 MBytes   312 Mbits/sec
[  1] 6.0000-7.0000 sec  37.1 MBytes   311 Mbits/sec
[  1] 7.0000-8.0000 sec  37.1 MBytes   311 Mbits/sec
[  1] 8.0000-9.0000 sec  37.1 MBytes   312 Mbits/sec
[  1] 9.0000-10.0000 sec  37.2 MBytes   312 Mbits/sec
[  1] 0.0000-10.0097 sec   369 MBytes   310 Mbits/sec

Regards,
Shenwei


> >> By small packets, do you mean those under the copybreak limit?
> >>
> >> Please provide some benchmark numbers with your next patchset.
> >
> > Yes, the packet size is 64 bytes and it is under the copybreak limit.
> > As the impact is not significant, I would prefer to remove the
> > copybreak  logic.
> 
> +1 to removing this logic if possible, due to maintenance cost.
> 
> --Jesper
Jesper Dangaard Brouer Sept. 29, 2022, 6:55 p.m. UTC | #18
On 29/09/2022 17.52, Shenwei Wang wrote:
> 
>> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
>>
>> On 29/09/2022 15.26, Shenwei Wang wrote:
>>>
>>>> From: Andrew Lunn <andrew@lunn.ch>
>>>> Sent: Thursday, September 29, 2022 8:23 AM
>> [...]
>>>>
>>>>> I actually did some comparison testing regarding the page pool for
>>>>> normal traffic.  So far I don't see significant improvement in the
>>>>> current implementation. The performance for large packets improves a
>>>>> little, and the performance for small packets gets a little worse.
>>>>
>>>> What hardware was this for? imx51? imx6? imx7 Vybrid? These all use the FEC.
>>>
>>> I tested on imx8qxp platform. It is ARM64.
>>
>> On mvneta driver/platform we saw huge speedup replacing:
>>
>>     page_pool_release_page(rxq->page_pool, page); with
>>     skb_mark_for_recycle(skb);
>>
>> As I mentioned: Today page_pool has SKB recycle support (you might have
>> looked at drivers that didn't utilize this yet), thus you don't need to release the
>> page (page_pool_release_page) here.  Instead you could simply mark the SKB
>> for recycling, unless the driver does some page refcnt tricks I didn't notice.
>>
>> On the mvneta driver/platform the DMA unmap (in page_pool_release_page)
>> was very expensive. This imx8qxp platform might have faster DMA unmap in
>> case it is cache-coherent.
>>
>> I would be very interested in knowing if skb_mark_for_recycle() helps on this
>> platform, for normal network stack performance.
>>
> 
> Did a quick comparison test of the following 3 scenarios:

Thanks for doing this! :-)

> 1. original implementation
> 
> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
> ------------------------------------------------------------
> Client connecting to 10.81.16.245, TCP port 5001
> TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
> ------------------------------------------------------------
> [  1] local 10.81.17.20 port 49154 connected with 10.81.16.245 port 5001
> [ ID] Interval       Transfer     Bandwidth
> [  1] 0.0000-1.0000 sec   104 MBytes   868 Mbits/sec
> [  1] 1.0000-2.0000 sec   105 MBytes   878 Mbits/sec
> [  1] 2.0000-3.0000 sec   105 MBytes   881 Mbits/sec
> [  1] 3.0000-4.0000 sec   105 MBytes   879 Mbits/sec
> [  1] 4.0000-5.0000 sec   105 MBytes   878 Mbits/sec
> [  1] 5.0000-6.0000 sec   105 MBytes   878 Mbits/sec
> [  1] 6.0000-7.0000 sec   104 MBytes   875 Mbits/sec
> [  1] 7.0000-8.0000 sec   104 MBytes   875 Mbits/sec
> [  1] 8.0000-9.0000 sec   104 MBytes   873 Mbits/sec
> [  1] 9.0000-10.0000 sec   104 MBytes   875 Mbits/sec
> [  1] 0.0000-10.0073 sec  1.02 GBytes   875 Mbits/sec
> 
> 2. Page pool with page_pool_release_page
> 
> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
> ------------------------------------------------------------
> Client connecting to 10.81.16.245, TCP port 5001
> TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
> ------------------------------------------------------------
> [  1] local 10.81.17.20 port 35924 connected with 10.81.16.245 port 5001
> [ ID] Interval       Transfer     Bandwidth
> [  1] 0.0000-1.0000 sec   101 MBytes   849 Mbits/sec
> [  1] 1.0000-2.0000 sec   102 MBytes   860 Mbits/sec
> [  1] 2.0000-3.0000 sec   102 MBytes   860 Mbits/sec
> [  1] 3.0000-4.0000 sec   102 MBytes   859 Mbits/sec
> [  1] 4.0000-5.0000 sec   103 MBytes   863 Mbits/sec
> [  1] 5.0000-6.0000 sec   103 MBytes   864 Mbits/sec
> [  1] 6.0000-7.0000 sec   103 MBytes   863 Mbits/sec
> [  1] 7.0000-8.0000 sec   103 MBytes   865 Mbits/sec
> [  1] 8.0000-9.0000 sec   103 MBytes   862 Mbits/sec
> [  1] 9.0000-10.0000 sec   102 MBytes   856 Mbits/sec
> [  1] 0.0000-10.0246 sec  1.00 GBytes   858 Mbits/sec
> 
> 
> 3. page pool with skb_mark_for_recycle
> 
> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
> ------------------------------------------------------------
> Client connecting to 10.81.16.245, TCP port 5001
> TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
> ------------------------------------------------------------
> [  1] local 10.81.17.20 port 42724 connected with 10.81.16.245 port 5001
> [ ID] Interval       Transfer     Bandwidth
> [  1] 0.0000-1.0000 sec   111 MBytes   931 Mbits/sec
> [  1] 1.0000-2.0000 sec   112 MBytes   935 Mbits/sec
> [  1] 2.0000-3.0000 sec   111 MBytes   934 Mbits/sec
> [  1] 3.0000-4.0000 sec   111 MBytes   934 Mbits/sec
> [  1] 4.0000-5.0000 sec   111 MBytes   934 Mbits/sec
> [  1] 5.0000-6.0000 sec   112 MBytes   935 Mbits/sec
> [  1] 6.0000-7.0000 sec   111 MBytes   934 Mbits/sec
> [  1] 7.0000-8.0000 sec   111 MBytes   933 Mbits/sec
> [  1] 8.0000-9.0000 sec   112 MBytes   935 Mbits/sec
> [  1] 9.0000-10.0000 sec   111 MBytes   933 Mbits/sec
> [  1] 0.0000-10.0069 sec  1.09 GBytes   934 Mbits/sec

This is a very significant performance improvement (page pool with
skb_mark_for_recycle).  This is very close to the max goodput for a
1Gbit/s link.


> For small packet size (64 bytes), all three cases have almost the same result:
> 

To me this indicates that the DMA map/unmap operations on this platform
are indeed more expensive on larger packets.  This is exactly what
page_pool avoids, by keeping the DMA mapping intact when recycling.

The driver still needs DMA-sync, although I notice you set the page_pool
feature flag PP_FLAG_DMA_SYNC_DEV. This is good, as page_pool will try to
reduce the sync size where possible. E.g. in this SKB case it will reduce
the DMA-sync to max_len=FEC_ENET_RX_FRSIZE, which should also help
performance.
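
For reference, the page_pool setup that gets you this reduced sync would be
roughly the sketch below (field values are my assumptions, not necessarily
what the patch does):

   struct page_pool_params pp_params = {
           .order = 0,
           .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
           .pool_size = size,                   /* RX ring size */
           .nid = dev_to_node(&fep->pdev->dev),
           .dev = &fep->pdev->dev,
           .dma_dir = DMA_FROM_DEVICE,
           .offset = FEC_ENET_XDP_HEADROOM,
           .max_len = FEC_ENET_RX_FRSIZE,       /* bounds the per-packet DMA-sync */
   };

   rxq->page_pool = page_pool_create(&pp_params);
   if (IS_ERR(rxq->page_pool)) {
           err = PTR_ERR(rxq->page_pool);
           rxq->page_pool = NULL;
   }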


> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1 -l 64
> ------------------------------------------------------------
> Client connecting to 10.81.16.245, TCP port 5001
> TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
> ------------------------------------------------------------
> [  1] local 10.81.17.20 port 58204 connected with 10.81.16.245 port 5001
> [ ID] Interval       Transfer     Bandwidth
> [  1] 0.0000-1.0000 sec  36.9 MBytes   309 Mbits/sec
> [  1] 1.0000-2.0000 sec  36.6 MBytes   307 Mbits/sec
> [  1] 2.0000-3.0000 sec  36.6 MBytes   307 Mbits/sec
> [  1] 3.0000-4.0000 sec  36.5 MBytes   307 Mbits/sec
> [  1] 4.0000-5.0000 sec  37.1 MBytes   311 Mbits/sec
> [  1] 5.0000-6.0000 sec  37.2 MBytes   312 Mbits/sec
> [  1] 6.0000-7.0000 sec  37.1 MBytes   311 Mbits/sec
> [  1] 7.0000-8.0000 sec  37.1 MBytes   311 Mbits/sec
> [  1] 8.0000-9.0000 sec  37.1 MBytes   312 Mbits/sec
> [  1] 9.0000-10.0000 sec  37.2 MBytes   312 Mbits/sec
> [  1] 0.0000-10.0097 sec   369 MBytes   310 Mbits/sec
> 
> Regards,
> Shenwei
> 
> 
>>>> By small packets, do you mean those under the copybreak limit?
>>>>
>>>> Please provide some benchmark numbers with your next patchset.
>>>
>>> Yes, the packet size is 64 bytes and it is under the copybreak limit.
>>> As the impact is not significant, I would prefer to remove the
>>> copybreak  logic.
>>
>> +1 to removing this logic if possible, due to maintenance cost.
>>
>> --Jesper
>
kernel test robot Oct. 3, 2022, 5:41 a.m. UTC | #19
Hi Shenwei,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]
[also build test ERROR on next-20220930]
[cannot apply to net/master linus/master v6.0]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Shenwei-Wang/net-fec-add-initial-XDP-support/20220928-232917
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git b9a5cbf8ba24e88071a97a51a09ef5cdf0d1f6a1
config: m68k-m5272c3_defconfig
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/0ef976d16b5d6993db5c83926eaa836fc91a6450
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Shenwei-Wang/net-fec-add-initial-XDP-support/20220928-232917
        git checkout 0ef976d16b5d6993db5c83926eaa836fc91a6450
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   m68k-linux-ld: drivers/net/ethernet/freescale/fec_main.o: in function `fec_enet_alloc_buffers':
>> fec_main.c:(.text+0x1c06): undefined reference to `page_pool_alloc_pages'
>> m68k-linux-ld: fec_main.c:(.text+0x1c6a): undefined reference to `page_pool_create'
   m68k-linux-ld: drivers/net/ethernet/freescale/fec_main.o: in function `fec_enet_rx_queue':
   fec_main.c:(.text+0x3e76): undefined reference to `page_pool_alloc_pages'
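
These undefined references mean the page_pool library is not built for this
m68k config. A likely fix (a suggestion to verify, not part of the posted
patch) is to select it from the driver's Kconfig entry:

   # drivers/net/ethernet/freescale/Kconfig (sketch)
   config FEC
           tristate "FEC ethernet controller (of ColdFire and some i.MX CPUs)"
           ...
           select PAGE_POOL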
Shenwei Wang Oct. 3, 2022, 12:49 p.m. UTC | #20
Hi Jesper,

> >> On mvneta driver/platform we saw huge speedup replacing:
> >>
> >>     page_pool_release_page(rxq->page_pool, page); with
> >>     skb_mark_for_recycle(skb);
> >>

After replacing page_pool_release_page with skb_mark_for_recycle, I found something that confused me a little in the testing results.
I tested with the sample app "xdpsock" under two modes:  1. Native (xdpsock -i eth0). 2. Skb-mode (xdpsock -S -i eth0).
The following are the testing results:
              With page_pool_release_page (pps)   With skb_mark_for_recycle (pps)

   SKB-Mode                90K                              200K
   Native                 190K                              190K

The skb_mark_for_recycle solution boosted the performance of SKB-Mode to 200K+ PPS. That is even higher than the
performance of the Native solution.  Is this result reasonable? Do you have any clue why the SKB-Mode performance can
go higher than that of the Native one?

Thanks,
Shenwei



> -----Original Message-----
> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> Sent: Thursday, September 29, 2022 1:55 PM
> To: Shenwei Wang <shenwei.wang@nxp.com>; Jesper Dangaard Brouer
> <jbrouer@redhat.com>; Andrew Lunn <andrew@lunn.ch>
> Cc: brouer@redhat.com; Joakim Zhang <qiangqing.zhang@nxp.com>; David S.
> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
> Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Alexei
> Starovoitov <ast@kernel.org>; Daniel Borkmann <daniel@iogearbox.net>;
> Jesper Dangaard Brouer <hawk@kernel.org>; John Fastabend
> <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; imx@lists.linux.dev
> Subject: Re: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
> 
> Caution: EXT Email
> 
> On 29/09/2022 17.52, Shenwei Wang wrote:
> >
> >> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> >>
> >> On 29/09/2022 15.26, Shenwei Wang wrote:
> >>>
> >>>> From: Andrew Lunn <andrew@lunn.ch>
> >>>> Sent: Thursday, September 29, 2022 8:23 AM
> >> [...]
> >>>>
> >>>>> I actually did some comparison testing regarding the page pool for
> >>>>> normal traffic.  So far I don't see significant improvement in the
> >>>>> current implementation. The performance for large packets improves
> >>>>> a little, and the performance for small packets gets a little worse.
> >>>>
> >>>> What hardware was this for? imx51? imx6? imx7 Vybrid? These all use the
> FEC.
> >>>
> >>> I tested on imx8qxp platform. It is ARM64.
> >>
> >> On mvneta driver/platform we saw huge speedup replacing:
> >>
> >>     page_pool_release_page(rxq->page_pool, page); with
> >>     skb_mark_for_recycle(skb);
> >>
> >> As I mentioned: Today page_pool has SKB recycle support (you might
> >> have looked at drivers that didn't utilize this yet), thus you don't
> >> need to release the page (page_pool_release_page) here.  Instead you
> >> could simply mark the SKB for recycling, unless the driver does some
> >> page refcnt tricks I didn't notice.
> >>
> >> On the mvneta driver/platform the DMA unmap (in
> >> page_pool_release_page) was very expensive. This imx8qxp platform
> >> might have faster DMA unmap in case it is cache-coherent.
> >>
> >> I would be very interested in knowing if skb_mark_for_recycle() helps
> >> on this platform, for normal network stack performance.
> >>
> >
> > Did a quick comparison test of the following 3 scenarios:
> 
> Thanks for doing this! :-)
> 
> > 1. original implementation
> >
> > shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
> > ------------------------------------------------------------
> > Client connecting to 10.81.16.245, TCP port 5001 TCP window size:  416
> > KByte (WARNING: requested 1.91 MByte)
> > ------------------------------------------------------------
> > [  1] local 10.81.17.20 port 49154 connected with 10.81.16.245 port 5001
> > [ ID] Interval       Transfer     Bandwidth
> > [  1] 0.0000-1.0000 sec   104 MBytes   868 Mbits/sec
> > [  1] 1.0000-2.0000 sec   105 MBytes   878 Mbits/sec
> > [  1] 2.0000-3.0000 sec   105 MBytes   881 Mbits/sec
> > [  1] 3.0000-4.0000 sec   105 MBytes   879 Mbits/sec
> > [  1] 4.0000-5.0000 sec   105 MBytes   878 Mbits/sec
> > [  1] 5.0000-6.0000 sec   105 MBytes   878 Mbits/sec
> > [  1] 6.0000-7.0000 sec   104 MBytes   875 Mbits/sec
> > [  1] 7.0000-8.0000 sec   104 MBytes   875 Mbits/sec
> > [  1] 8.0000-9.0000 sec   104 MBytes   873 Mbits/sec
> > [  1] 9.0000-10.0000 sec   104 MBytes   875 Mbits/sec
> > [  1] 0.0000-10.0073 sec  1.02 GBytes   875 Mbits/sec
> >
> > 2. Page pool with page_pool_release_page
> >
> > shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
> > ------------------------------------------------------------
> > Client connecting to 10.81.16.245, TCP port 5001 TCP window size:  416
> > KByte (WARNING: requested 1.91 MByte)
> > ------------------------------------------------------------
> > [  1] local 10.81.17.20 port 35924 connected with 10.81.16.245 port 5001
> > [ ID] Interval       Transfer     Bandwidth
> > [  1] 0.0000-1.0000 sec   101 MBytes   849 Mbits/sec
> > [  1] 1.0000-2.0000 sec   102 MBytes   860 Mbits/sec
> > [  1] 2.0000-3.0000 sec   102 MBytes   860 Mbits/sec
> > [  1] 3.0000-4.0000 sec   102 MBytes   859 Mbits/sec
> > [  1] 4.0000-5.0000 sec   103 MBytes   863 Mbits/sec
> > [  1] 5.0000-6.0000 sec   103 MBytes   864 Mbits/sec
> > [  1] 6.0000-7.0000 sec   103 MBytes   863 Mbits/sec
> > [  1] 7.0000-8.0000 sec   103 MBytes   865 Mbits/sec
> > [  1] 8.0000-9.0000 sec   103 MBytes   862 Mbits/sec
> > [  1] 9.0000-10.0000 sec   102 MBytes   856 Mbits/sec
> > [  1] 0.0000-10.0246 sec  1.00 GBytes   858 Mbits/sec
> >
> >
> > 3. page pool with skb_mark_for_recycle
> >
> > shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
> > ------------------------------------------------------------
> > Client connecting to 10.81.16.245, TCP port 5001 TCP window size:  416
> > KByte (WARNING: requested 1.91 MByte)
> > ------------------------------------------------------------
> > [  1] local 10.81.17.20 port 42724 connected with 10.81.16.245 port 5001
> > [ ID] Interval       Transfer     Bandwidth
> > [  1] 0.0000-1.0000 sec   111 MBytes   931 Mbits/sec
> > [  1] 1.0000-2.0000 sec   112 MBytes   935 Mbits/sec
> > [  1] 2.0000-3.0000 sec   111 MBytes   934 Mbits/sec
> > [  1] 3.0000-4.0000 sec   111 MBytes   934 Mbits/sec
> > [  1] 4.0000-5.0000 sec   111 MBytes   934 Mbits/sec
> > [  1] 5.0000-6.0000 sec   112 MBytes   935 Mbits/sec
> > [  1] 6.0000-7.0000 sec   111 MBytes   934 Mbits/sec
> > [  1] 7.0000-8.0000 sec   111 MBytes   933 Mbits/sec
> > [  1] 8.0000-9.0000 sec   112 MBytes   935 Mbits/sec
> > [  1] 9.0000-10.0000 sec   111 MBytes   933 Mbits/sec
> > [  1] 0.0000-10.0069 sec  1.09 GBytes   934 Mbits/sec
> 
> This is a very significant performance improvement (page pool with
> skb_mark_for_recycle).  This is very close to the max goodput for a 1Gbit/s link.
> 
> 
> > For small packet size (64 bytes), all three cases have almost the same result:
> >
> 
> To me this indicates that the DMA map/unmap operations on this platform are
> indeed more expensive on larger packets.  This is exactly what page_pool
> avoids, by keeping the DMA mapping intact when recycling.
> 
> The driver still needs DMA-sync, although I notice you set the page_pool feature
> flag PP_FLAG_DMA_SYNC_DEV. This is good, as page_pool will try to reduce the
> sync size where possible. E.g. in this SKB case it will reduce the DMA-sync to
> max_len=FEC_ENET_RX_FRSIZE, which should also help performance.
> 
> 
> > shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1 -l 64
> > ------------------------------------------------------------
> > Client connecting to 10.81.16.245, TCP port 5001 TCP window size:  416
> > KByte (WARNING: requested 1.91 MByte)
> > ------------------------------------------------------------
> > [  1] local 10.81.17.20 port 58204 connected with 10.81.16.245 port 5001
> > [ ID] Interval       Transfer     Bandwidth
> > [  1] 0.0000-1.0000 sec  36.9 MBytes   309 Mbits/sec
> > [  1] 1.0000-2.0000 sec  36.6 MBytes   307 Mbits/sec
> > [  1] 2.0000-3.0000 sec  36.6 MBytes   307 Mbits/sec
> > [  1] 3.0000-4.0000 sec  36.5 MBytes   307 Mbits/sec
> > [  1] 4.0000-5.0000 sec  37.1 MBytes   311 Mbits/sec
> > [  1] 5.0000-6.0000 sec  37.2 MBytes   312 Mbits/sec
> > [  1] 6.0000-7.0000 sec  37.1 MBytes   311 Mbits/sec
> > [  1] 7.0000-8.0000 sec  37.1 MBytes   311 Mbits/sec
> > [  1] 8.0000-9.0000 sec  37.1 MBytes   312 Mbits/sec
> > [  1] 9.0000-10.0000 sec  37.2 MBytes   312 Mbits/sec
> > [  1] 0.0000-10.0097 sec   369 MBytes   310 Mbits/sec
> >
> > Regards,
> > Shenwei
> >
> >
> >>>> By small packets, do you mean those under the copybreak limit?
> >>>>
> >>>> Please provide some benchmark numbers with your next patchset.
> >>>
> >>> Yes, the packet size is 64 bytes and it is under the copybreak limit.
> >>> As the impact is not significant, I would prefer to remove the
> >>> copybreak  logic.
> >>
> >> +1 to removing this logic if possible, due to maintenance cost.
> >>
> >> --Jesper
> >
Jesper Dangaard Brouer Oct. 4, 2022, 11:21 a.m. UTC | #21
On 03/10/2022 14.49, Shenwei Wang wrote:
> Hi Jesper,
> 
>>>> On mvneta driver/platform we saw huge speedup replacing:
>>>>
>>>>      page_pool_release_page(rxq->page_pool, page); with
>>>>      skb_mark_for_recycle(skb);
>>>>
> 
> After replacing page_pool_release_page with skb_mark_for_recycle, I found
> something that confused me a little in the testing results.
>
> I tested with the sample app "xdpsock" under two modes:
>  1. Native (xdpsock -i eth0).
>  2. Skb-mode (xdpsock -S -i eth0).

Great that you are also testing AF_XDP, but do you have a particular
use-case that needs AF_XDP on this board?

What packet size is used in the results below?

> The following are the testing results:
>
>       With page_pool_release_page (pps)  With skb_mark_for_recycle (pps)
> 
>   SKB-Mode                          90K                            200K
>   Native                           190K                            190K
> 

The default AF_XDP test with xdpsock is rxdrop IIRC.

Can you test the normal XDP code path and do a XDP_DROP test via the
samples tool 'xdp_rxq_info' and cmdline:

   sudo ./xdp_rxq_info --dev eth42 --act XDP_DROP --read

And then same with --skb-mode

> The skb_mark_for_recycle solution boosted the performance of SKB-Mode
> to 200K+ PPS. That is even higher than the performance of the Native
> solution.  Is this result reasonable? Do you have any clue why the
> SKB-Mode performance can go higher than that of the Native one?

I might be able to explain this (Cc. AF_XDP maintainers to keep me honest).

When you say "native" *AF_XDP* that isn't Zero-Copy AF_XDP.

Sure, XDP runs in native driver mode and redirects the raw frames into
the AF_XDP socket, but this isn't zero-copy AF_XDP. Thus, the packets
need to be copied into the AF_XDP buffers.

As soon as the frame or SKB (for generic XDP) has been copied, it is
released/freed by the AF_XDP/xsk code (either via xdp_return_buff() or
consume_skb()). Thus, it looks like it really pays off to recycle the
frame via page_pool, also for the SKB consume_skb() case.

I am still a little surprised that it can be faster than native AF_XDP,
as the SKB-mode ("XDP-generic") needs to call through a lot more software
layers and convert the SKB to look like an xdp_buff.

--Jesper



>> -----Original Message-----
>> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
>> Sent: Thursday, September 29, 2022 1:55 PM
>> To: Shenwei Wang <shenwei.wang@nxp.com>; Jesper Dangaard Brouer
>> <jbrouer@redhat.com>; Andrew Lunn <andrew@lunn.ch>
>> Cc: brouer@redhat.com; Joakim Zhang <qiangqing.zhang@nxp.com>; David S.
>> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
>> Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Alexei
>> Starovoitov <ast@kernel.org>; Daniel Borkmann <daniel@iogearbox.net>;
>> Jesper Dangaard Brouer <hawk@kernel.org>; John Fastabend
>> <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
>> kernel@vger.kernel.org; imx@lists.linux.dev
>> Subject: Re: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
>>
>> Caution: EXT Email
>>
>> On 29/09/2022 17.52, Shenwei Wang wrote:
>>>
>>>> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
>>>>
>>>> On 29/09/2022 15.26, Shenwei Wang wrote:
>>>>>
>>>>>> From: Andrew Lunn <andrew@lunn.ch>
>>>>>> Sent: Thursday, September 29, 2022 8:23 AM
>>>> [...]
>>>>>>
>>>>>>> I actually did some comparison testing regarding the page pool for
>>>>>>> normal traffic.  So far I don't see significant improvement in the
>>>>>>> current implementation. The performance for large packets improves
>>>>>>> a little, and the performance for small packets gets a little worse.
>>>>>>
>>>>>> What hardware was this for? imx51? imx6? imx7 Vybrid? These all use the
>> FEC.
>>>>>
>>>>> I tested on imx8qxp platform. It is ARM64.
>>>>
>>>> On mvneta driver/platform we saw huge speedup replacing:
>>>>
>>>>      page_pool_release_page(rxq->page_pool, page); with
>>>>      skb_mark_for_recycle(skb);
>>>>
>>>> As I mentioned: Today page_pool has SKB recycle support (you might
>>>> have looked at drivers that didn't utilize this yet), thus you don't
>>>> need to release the page (page_pool_release_page) here.  Instead you
>>>> could simply mark the SKB for recycling, unless the driver does some
>>>> page refcnt tricks I didn't notice.
>>>>
>>>> On the mvneta driver/platform the DMA unmap (in
>>>> page_pool_release_page) was very expensive. This imx8qxp platform
>>>> might have faster DMA unmap in case it is cache-coherent.
>>>>
>>>> I would be very interested in knowing if skb_mark_for_recycle() helps
>>>> on this platform, for normal network stack performance.
>>>>
>>>
>>> Did a quick comparison test of the following 3 scenarios:
>>
>> Thanks for doing this! :-)
>>
>>> 1. original implementation
>>>
>>> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
>>> ------------------------------------------------------------
>>> Client connecting to 10.81.16.245, TCP port 5001 TCP window size:  416
>>> KByte (WARNING: requested 1.91 MByte)
>>> ------------------------------------------------------------
>>> [  1] local 10.81.17.20 port 49154 connected with 10.81.16.245 port 5001
>>> [ ID] Interval       Transfer     Bandwidth
>>> [  1] 0.0000-1.0000 sec   104 MBytes   868 Mbits/sec
>>> [  1] 1.0000-2.0000 sec   105 MBytes   878 Mbits/sec
>>> [  1] 2.0000-3.0000 sec   105 MBytes   881 Mbits/sec
>>> [  1] 3.0000-4.0000 sec   105 MBytes   879 Mbits/sec
>>> [  1] 4.0000-5.0000 sec   105 MBytes   878 Mbits/sec
>>> [  1] 5.0000-6.0000 sec   105 MBytes   878 Mbits/sec
>>> [  1] 6.0000-7.0000 sec   104 MBytes   875 Mbits/sec
>>> [  1] 7.0000-8.0000 sec   104 MBytes   875 Mbits/sec
>>> [  1] 8.0000-9.0000 sec   104 MBytes   873 Mbits/sec
>>> [  1] 9.0000-10.0000 sec   104 MBytes   875 Mbits/sec
>>> [  1] 0.0000-10.0073 sec  1.02 GBytes   875 Mbits/sec
>>>
>>> 2. Page pool with page_pool_release_page
>>>
>>> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
>>> ------------------------------------------------------------
>>> Client connecting to 10.81.16.245, TCP port 5001 TCP window size:  416
>>> KByte (WARNING: requested 1.91 MByte)
>>> ------------------------------------------------------------
>>> [  1] local 10.81.17.20 port 35924 connected with 10.81.16.245 port 5001
>>> [ ID] Interval       Transfer     Bandwidth
>>> [  1] 0.0000-1.0000 sec   101 MBytes   849 Mbits/sec
>>> [  1] 1.0000-2.0000 sec   102 MBytes   860 Mbits/sec
>>> [  1] 2.0000-3.0000 sec   102 MBytes   860 Mbits/sec
>>> [  1] 3.0000-4.0000 sec   102 MBytes   859 Mbits/sec
>>> [  1] 4.0000-5.0000 sec   103 MBytes   863 Mbits/sec
>>> [  1] 5.0000-6.0000 sec   103 MBytes   864 Mbits/sec
>>> [  1] 6.0000-7.0000 sec   103 MBytes   863 Mbits/sec
>>> [  1] 7.0000-8.0000 sec   103 MBytes   865 Mbits/sec
>>> [  1] 8.0000-9.0000 sec   103 MBytes   862 Mbits/sec
>>> [  1] 9.0000-10.0000 sec   102 MBytes   856 Mbits/sec
>>> [  1] 0.0000-10.0246 sec  1.00 GBytes   858 Mbits/sec
>>>
>>>
>>> 3. page pool with skb_mark_for_recycle
>>>
>>> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
>>> ------------------------------------------------------------
>>> Client connecting to 10.81.16.245, TCP port 5001 TCP window size:  416
>>> KByte (WARNING: requested 1.91 MByte)
>>> ------------------------------------------------------------
>>> [  1] local 10.81.17.20 port 42724 connected with 10.81.16.245 port 5001
>>> [ ID] Interval       Transfer     Bandwidth
>>> [  1] 0.0000-1.0000 sec   111 MBytes   931 Mbits/sec
>>> [  1] 1.0000-2.0000 sec   112 MBytes   935 Mbits/sec
>>> [  1] 2.0000-3.0000 sec   111 MBytes   934 Mbits/sec
>>> [  1] 3.0000-4.0000 sec   111 MBytes   934 Mbits/sec
>>> [  1] 4.0000-5.0000 sec   111 MBytes   934 Mbits/sec
>>> [  1] 5.0000-6.0000 sec   112 MBytes   935 Mbits/sec
>>> [  1] 6.0000-7.0000 sec   111 MBytes   934 Mbits/sec
>>> [  1] 7.0000-8.0000 sec   111 MBytes   933 Mbits/sec
>>> [  1] 8.0000-9.0000 sec   112 MBytes   935 Mbits/sec
>>> [  1] 9.0000-10.0000 sec   111 MBytes   933 Mbits/sec
>>> [  1] 0.0000-10.0069 sec  1.09 GBytes   934 Mbits/sec
>>
>> This is a very significant performance improvement (page pool with
>> skb_mark_for_recycle).  This is very close to the max goodput for a 1Gbit/s link.
>>
>>
>>> For small packet size (64 bytes), all three cases have almost the same result:
>>>
>>
>> To me this indicates that the DMA map/unmap operations on this platform
>> are indeed more expensive on larger packets.  This is exactly what
>> page_pool avoids, by keeping the DMA mapping intact when recycling.
>>
>> The driver still needs DMA-sync, although I notice you set the page_pool
>> feature flag PP_FLAG_DMA_SYNC_DEV. This is good, as page_pool will try to
>> reduce the sync size where possible. E.g. in this SKB case it will reduce
>> the DMA-sync to max_len=FEC_ENET_RX_FRSIZE, which should also help performance.
>>
>>
>>> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1 -l 64
>>> ------------------------------------------------------------
>>> Client connecting to 10.81.16.245, TCP port 5001 TCP window size:  416
>>> KByte (WARNING: requested 1.91 MByte)
>>> ------------------------------------------------------------
>>> [  1] local 10.81.17.20 port 58204 connected with 10.81.16.245 port 5001
>>> [ ID] Interval       Transfer     Bandwidth
>>> [  1] 0.0000-1.0000 sec  36.9 MBytes   309 Mbits/sec
>>> [  1] 1.0000-2.0000 sec  36.6 MBytes   307 Mbits/sec
>>> [  1] 2.0000-3.0000 sec  36.6 MBytes   307 Mbits/sec
>>> [  1] 3.0000-4.0000 sec  36.5 MBytes   307 Mbits/sec
>>> [  1] 4.0000-5.0000 sec  37.1 MBytes   311 Mbits/sec
>>> [  1] 5.0000-6.0000 sec  37.2 MBytes   312 Mbits/sec
>>> [  1] 6.0000-7.0000 sec  37.1 MBytes   311 Mbits/sec
>>> [  1] 7.0000-8.0000 sec  37.1 MBytes   311 Mbits/sec
>>> [  1] 8.0000-9.0000 sec  37.1 MBytes   312 Mbits/sec
>>> [  1] 9.0000-10.0000 sec  37.2 MBytes   312 Mbits/sec
>>> [  1] 0.0000-10.0097 sec   369 MBytes   310 Mbits/sec
>>>
>>> Regards,
>>> Shenwei
>>>
>>>
>>>>>> By small packets, do you mean those under the copybreak limit?
>>>>>>
>>>>>> Please provide some benchmark numbers with your next patchset.
>>>>>
>>>>> Yes, the packet size is 64 bytes and it is under the copybreak limit.
>>>>> As the impact is not significant, I would prefer to remove the
>>>>> copybreak  logic.
>>>>
>>>> +1 to removing this logic if possible, due to maintenance cost.
>>>>
>>>> --Jesper
>>>
>
Shenwei Wang Oct. 4, 2022, 1:12 p.m. UTC | #22
> -----Original Message-----
> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> Sent: Tuesday, October 4, 2022 6:22 AM
> To: Shenwei Wang <shenwei.wang@nxp.com>; Jesper Dangaard Brouer
> <jbrouer@redhat.com>; Andrew Lunn <andrew@lunn.ch>
> Cc: brouer@redhat.com; Joakim Zhang <qiangqing.zhang@nxp.com>; David S.
> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
> Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Alexei
> Starovoitov <ast@kernel.org>; Daniel Borkmann <daniel@iogearbox.net>;
> Jesper Dangaard Brouer <hawk@kernel.org>; John Fastabend
> <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; imx@lists.linux.dev; Magnus Karlsson
> <magnus.karlsson@gmail.com>; Björn Töpel <bjorn@kernel.org>; Ilias
> Apalodimas <ilias.apalodimas@linaro.org>
> Subject: Re: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
> 
> Caution: EXT Email
> 
> On 03/10/2022 14.49, Shenwei Wang wrote:
> > Hi Jesper,
> >
> >>>> On mvneta driver/platform we saw huge speedup replacing:
> >>>>
> >>>>      page_pool_release_page(rxq->page_pool, page); with
> >>>>      skb_mark_for_recycle(skb);
> >>>>
> >
> > After replacing page_pool_release_page with skb_mark_for_recycle, I
> > found something that confused me a little in the testing results.
> > I tested with the sample app "xdpsock" under two modes:
> >  1. Native (xdpsock -i eth0).
> >  2. Skb-mode (xdpsock -S -i eth0).
> Great that you are also testing AF_XDP, but do you have a particular use-case
> that needs AF_XDP on this board?

The purpose is to provide our customers with an alternative solution to the current DPDK implementation.

> 
> What packet size is used in the results below?

The packets were generated by pktgen_sample03_burst_single_flow.sh, and their size is 60 bytes with 0 frags.
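
For context, that script lives in samples/pktgen/ and is typically invoked
along these lines (the interface and destination MAC below are placeholders):

   # ./pktgen_sample03_burst_single_flow.sh -i eth1 \
         -d 10.81.17.20 -m aa:bb:cc:dd:ee:ff -s 60 -t 1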

> 
> > The following are the testing results:
> >
> >       With page_pool_release_page (pps)  With skb_mark_for_recycle (pps)
> >
> >   SKB-Mode                          90K                            200K
> >   Native                           190K                            190K
> >
> 
> The default AF_XDP test with xdpsock is rxdrop IIRC.
> 
> Can you test the normal XDP code path and do a XDP_DROP test via the samples
> tool 'xdp_rxq_info' and cmdline:
> 
>    sudo ./xdp_rxq_info --dev eth42 --act XDP_DROP --read
> 
> And then same with --skb-mode

I haven't tested xdp_rxq_info yet, and will give it a try sometime later today. However, for the XDP_DROP test, I
did try the xdp2 test case, and the testing result looks reasonable. The performance of Native mode is much higher than that of skb-mode.

# xdp2 eth0
 proto 0:     475362 pkt/s

# xdp2 -S eth0             (page_pool_release_page solution)
 proto 17:     71999 pkt/s

# xdp2 -S eth0             (skb_mark_for_recycle solution)
 proto 17:     72228 pkt/s

> 
> > The skb_mark_for_recycle solution boosted the performance of SKB-Mode
> > to 200K+ PPS. That is even higher than the performance of the Native
> > solution.  Is this result reasonable? Do you have any clue why the
> > SKB-Mode performance can go higher than that of the Native one?
> I might be able to explain this (Cc. AF_XDP maintainers to keep me honest).
> 
> When you say "native" *AF_XDP* that isn't Zero-Copy AF_XDP.
> 

Right. Zero-copy hasn't been implemented yet.

> Sure, XDP runs in native driver mode and redirects the raw frames into the
> AF_XDP socket, but this isn't zero-copy AF_XDP. Thus, the packets need to be
> copied into the AF_XDP buffers.
> 
> As soon as the frame or SKB (for generic XDP) has been copied, it is
> released/freed by the AF_XDP/xsk code (either via xdp_return_buff() or
> consume_skb()). Thus, it looks like it really pays off to recycle the frame via
> page_pool, also for the SKB consume_skb() case.
> 
> I am still a little surprised that it can be faster than native AF_XDP, as the SKB-
> mode ("XDP-generic") needs to call through a lot more software layers and
> convert the SKB to look like an xdp_buff.

That's what I can't understand right now either.

Thanks very much for the explanations!
Shenwei

> 
> --Jesper
> 
> 
> 
> >> -----Original Message-----
> >> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> >> Sent: Thursday, September 29, 2022 1:55 PM
> >> To: Shenwei Wang <shenwei.wang@nxp.com>; Jesper Dangaard Brouer
> >> <jbrouer@redhat.com>; Andrew Lunn <andrew@lunn.ch>
> >> Cc: brouer@redhat.com; Joakim Zhang <qiangqing.zhang@nxp.com>; David
> S.
> >> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>;
> >> Jakub Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>;
> >> Alexei Starovoitov <ast@kernel.org>; Daniel Borkmann
> >> <daniel@iogearbox.net>; Jesper Dangaard Brouer <hawk@kernel.org>;
> >> John Fastabend <john.fastabend@gmail.com>; netdev@vger.kernel.org;
> >> linux- kernel@vger.kernel.org; imx@lists.linux.dev
> >> Subject: Re: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
> >>
> >> Caution: EXT Email
> >>
> >> On 29/09/2022 17.52, Shenwei Wang wrote:
> >>>
> >>>> From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> >>>>
> >>>> On 29/09/2022 15.26, Shenwei Wang wrote:
> >>>>>
> >>>>>> From: Andrew Lunn <andrew@lunn.ch>
> >>>>>> Sent: Thursday, September 29, 2022 8:23 AM
> >>>> [...]
> >>>>>>
> >>>>>>> I actually did some comparison testing regarding the page pool for
> >>>>>>> normal traffic.  So far I don't see significant improvement in
> >>>>>>> the current implementation. The performance for large packets
> >>>>>>> improves a little, and the performance for small packets gets a
> >>>>>>> little worse.
> >>>>>>
> >>>>>> What hardware was this for? imx51? imx6? imx7 Vybrid? These all
> >>>>>> use the
> >> FEC.
> >>>>>
> >>>>> I tested on imx8qxp platform. It is ARM64.
> >>>>
> >>>> On mvneta driver/platform we saw huge speedup replacing:
> >>>>
> >>>>      page_pool_release_page(rxq->page_pool, page); with
> >>>>      skb_mark_for_recycle(skb);
> >>>>
> >>>> As I mentioned: Today page_pool has SKB recycle support (you might
> >>>> have looked at drivers that didn't utilize this yet), thus you
> >>>> don't need to release the page (page_pool_release_page) here.
> >>>> Instead you could simply mark the SKB for recycling, unless the
> >>>> driver does some page refcnt tricks I didn't notice.
> >>>>
> >>>> On the mvneta driver/platform the DMA unmap (in
> >>>> page_pool_release_page) was very expensive. This imx8qxp platform
> >>>> might have faster DMA unmap in case it is cache-coherent.
> >>>>
> >>>> I would be very interested in knowing if skb_mark_for_recycle()
> >>>> helps on this platform, for normal network stack performance.
> >>>>
> >>>
> >>> Did a quick comparison test of the following 3 scenarios:
> >>
> >> Thanks for doing this! :-)
> >>
> >>> 1. original implementation
> >>>
> >>> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
> >>> ------------------------------------------------------------
> >>> Client connecting to 10.81.16.245, TCP port 5001 TCP window size:
> >>> 416 KByte (WARNING: requested 1.91 MByte)
> >>> ------------------------------------------------------------
> >>> [  1] local 10.81.17.20 port 49154 connected with 10.81.16.245 port 5001
> >>> [ ID] Interval       Transfer     Bandwidth
> >>> [  1] 0.0000-1.0000 sec   104 MBytes   868 Mbits/sec
> >>> [  1] 1.0000-2.0000 sec   105 MBytes   878 Mbits/sec
> >>> [  1] 2.0000-3.0000 sec   105 MBytes   881 Mbits/sec
> >>> [  1] 3.0000-4.0000 sec   105 MBytes   879 Mbits/sec
> >>> [  1] 4.0000-5.0000 sec   105 MBytes   878 Mbits/sec
> >>> [  1] 5.0000-6.0000 sec   105 MBytes   878 Mbits/sec
> >>> [  1] 6.0000-7.0000 sec   104 MBytes   875 Mbits/sec
> >>> [  1] 7.0000-8.0000 sec   104 MBytes   875 Mbits/sec
> >>> [  1] 8.0000-9.0000 sec   104 MBytes   873 Mbits/sec
> >>> [  1] 9.0000-10.0000 sec   104 MBytes   875 Mbits/sec
> >>> [  1] 0.0000-10.0073 sec  1.02 GBytes   875 Mbits/sec
> >>>
> >>> 2. Page pool with page_pool_release_page
> >>>
> >>> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
> >>> ------------------------------------------------------------
> >>> Client connecting to 10.81.16.245, TCP port 5001 TCP window size:
> >>> 416 KByte (WARNING: requested 1.91 MByte)
> >>> ------------------------------------------------------------
> >>> [  1] local 10.81.17.20 port 35924 connected with 10.81.16.245 port 5001
> >>> [ ID] Interval       Transfer     Bandwidth
> >>> [  1] 0.0000-1.0000 sec   101 MBytes   849 Mbits/sec
> >>> [  1] 1.0000-2.0000 sec   102 MBytes   860 Mbits/sec
> >>> [  1] 2.0000-3.0000 sec   102 MBytes   860 Mbits/sec
> >>> [  1] 3.0000-4.0000 sec   102 MBytes   859 Mbits/sec
> >>> [  1] 4.0000-5.0000 sec   103 MBytes   863 Mbits/sec
> >>> [  1] 5.0000-6.0000 sec   103 MBytes   864 Mbits/sec
> >>> [  1] 6.0000-7.0000 sec   103 MBytes   863 Mbits/sec
> >>> [  1] 7.0000-8.0000 sec   103 MBytes   865 Mbits/sec
> >>> [  1] 8.0000-9.0000 sec   103 MBytes   862 Mbits/sec
> >>> [  1] 9.0000-10.0000 sec   102 MBytes   856 Mbits/sec
> >>> [  1] 0.0000-10.0246 sec  1.00 GBytes   858 Mbits/sec
> >>>
> >>>
> >>> 3. page pool with skb_mark_for_recycle
> >>>
> >>> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
> >>> ------------------------------------------------------------
> >>> Client connecting to 10.81.16.245, TCP port 5001 TCP window size:
> >>> 416 KByte (WARNING: requested 1.91 MByte)
> >>> ------------------------------------------------------------
> >>> [  1] local 10.81.17.20 port 42724 connected with 10.81.16.245 port 5001
> >>> [ ID] Interval       Transfer     Bandwidth
> >>> [  1] 0.0000-1.0000 sec   111 MBytes   931 Mbits/sec
> >>> [  1] 1.0000-2.0000 sec   112 MBytes   935 Mbits/sec
> >>> [  1] 2.0000-3.0000 sec   111 MBytes   934 Mbits/sec
> >>> [  1] 3.0000-4.0000 sec   111 MBytes   934 Mbits/sec
> >>> [  1] 4.0000-5.0000 sec   111 MBytes   934 Mbits/sec
> >>> [  1] 5.0000-6.0000 sec   112 MBytes   935 Mbits/sec
> >>> [  1] 6.0000-7.0000 sec   111 MBytes   934 Mbits/sec
> >>> [  1] 7.0000-8.0000 sec   111 MBytes   933 Mbits/sec
> >>> [  1] 8.0000-9.0000 sec   112 MBytes   935 Mbits/sec
> >>> [  1] 9.0000-10.0000 sec   111 MBytes   933 Mbits/sec
> >>> [  1] 0.0000-10.0069 sec  1.09 GBytes   934 Mbits/sec
> >>
> >> This is a very significant performance improvement (page pool with
> >> skb_mark_for_recycle).  This is very close to the max goodput for a 1Gbit/s
> link.
> >>
> >>
> >>> For small packet size (64 bytes), all three cases have almost the same result:
> >>>
> >>
> >> To me this indicates that the DMA map/unmap operations on this
> >> platform are indeed more expensive on larger packets.  This is exactly
> >> what page_pool avoids, by keeping the DMA mapping intact when recycling.
> >>
> >> The driver still needs DMA-sync, although I notice you set the page_pool
> >> feature flag PP_FLAG_DMA_SYNC_DEV. This is good, as page_pool will try
> >> to reduce the sync size where possible. E.g. in this SKB case it will reduce
> >> the DMA-sync to max_len=FEC_ENET_RX_FRSIZE, which should also help
> >> performance.
> >>
> >>
> >>> shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1 -l 64
> >>> ------------------------------------------------------------
> >>> Client connecting to 10.81.16.245, TCP port 5001 TCP window size:
> >>> 416 KByte (WARNING: requested 1.91 MByte)
> >>> ------------------------------------------------------------
> >>> [  1] local 10.81.17.20 port 58204 connected with 10.81.16.245 port 5001
> >>> [ ID] Interval       Transfer     Bandwidth
> >>> [  1] 0.0000-1.0000 sec  36.9 MBytes   309 Mbits/sec
> >>> [  1] 1.0000-2.0000 sec  36.6 MBytes   307 Mbits/sec
> >>> [  1] 2.0000-3.0000 sec  36.6 MBytes   307 Mbits/sec
> >>> [  1] 3.0000-4.0000 sec  36.5 MBytes   307 Mbits/sec
> >>> [  1] 4.0000-5.0000 sec  37.1 MBytes   311 Mbits/sec
> >>> [  1] 5.0000-6.0000 sec  37.2 MBytes   312 Mbits/sec
> >>> [  1] 6.0000-7.0000 sec  37.1 MBytes   311 Mbits/sec
> >>> [  1] 7.0000-8.0000 sec  37.1 MBytes   311 Mbits/sec
> >>> [  1] 8.0000-9.0000 sec  37.1 MBytes   312 Mbits/sec
> >>> [  1] 9.0000-10.0000 sec  37.2 MBytes   312 Mbits/sec
> >>> [  1] 0.0000-10.0097 sec   369 MBytes   310 Mbits/sec
> >>>
> >>> Regards,
> >>> Shenwei
> >>>
> >>>
> >>>>>> By small packets, do you mean those under the copybreak limit?
> >>>>>>
> >>>>>> Please provide some benchmark numbers with your next patchset.
> >>>>>
> >>>>> Yes, the packet size is 64 bytes and it is under the copybreak limit.
> >>>>> As the impact is not significant, I would prefer to remove the
> >>>>> copybreak  logic.
> >>>>
> >>>> +1 to removing this logic if possible, due to maintenance cost.
> >>>>
> >>>> --Jesper
> >>>
> >
Shenwei Wang Oct. 4, 2022, 1:34 p.m. UTC | #23
> -----Original Message-----
> From: Shenwei Wang
> Sent: Tuesday, October 4, 2022 8:13 AM
> To: Jesper Dangaard Brouer <jbrouer@redhat.com>; Andrew Lunn
...
> I haven't tested xdp_rxq_info yet, and will have a try sometime later today.
> However, for the XDP_DROP test, I did try xdp2 test case, and the testing result
> looks reasonable. The performance of Native mode is much higher than skb-
> mode.
> 
> # xdp2 eth0
>  proto 0:     475362 pkt/s
> 
> # xdp2 -S eth0             (page_pool_release_page solution)
>  proto 17:     71999 pkt/s
> 
> # xdp2 -S eth0             (skb_mark_for_recycle solution)
>  proto 17:     72228 pkt/s
> 

Correction for xdp2 -S eth0	(skb_mark_for_recycle solution)
proto 0:          0 pkt/s
proto 17:     122473 pkt/s

Thanks,
Shenwei
Shenwei Wang Oct. 5, 2022, 12:40 p.m. UTC | #24
Hi Jesper,

Here is the summary of "xdp_rxq_info" testing.

              skb_mark_for_recycle           page_pool_release_page

             Native        SKB-Mode           Native          SKB-Mode
XDP_DROP     460K           220K              460K             102K
XDP_PASS     80K            113K              60K              62K


The following are the testing log.

Thanks,
Shenwei

### skb_mark_for_recycle solution ###

./xdp_rxq_info --dev eth0 --act XDP_DROP --read

Running XDP on dev:eth0 (ifindex:2) action:XDP_DROP options:read
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      0       466,553     0
XDP-RX CPU      total   466,553

./xdp_rxq_info -S --dev eth0 --act XDP_DROP --read

Running XDP on dev:eth0 (ifindex:2) action:XDP_DROP options:read
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      0       226,272     0
XDP-RX CPU      total   226,272

./xdp_rxq_info --dev eth0 --act XDP_PASS --read

Running XDP on dev:eth0 (ifindex:2) action:XDP_PASS options:read
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      0       80,518      0
XDP-RX CPU      total   80,518

./xdp_rxq_info -S --dev eth0 --act XDP_PASS --read

Running XDP on dev:eth0 (ifindex:2) action:XDP_PASS options:read
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      0       113,681     0
XDP-RX CPU      total   113,681


### page_pool_release_page solution ###

./xdp_rxq_info --dev eth0 --act XDP_DROP --read

Running XDP on dev:eth0 (ifindex:2) action:XDP_DROP options:read
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      0       463,145     0
XDP-RX CPU      total   463,145

./xdp_rxq_info -S --dev eth0 --act XDP_DROP --read

Running XDP on dev:eth0 (ifindex:2) action:XDP_DROP options:read
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      0       104,443     0
XDP-RX CPU      total   104,443

./xdp_rxq_info --dev eth0 --act XDP_PASS --read

Running XDP on dev:eth0 (ifindex:2) action:XDP_PASS options:read
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      0       60,539      0
XDP-RX CPU      total   60,539

./xdp_rxq_info -S --dev eth0 --act XDP_PASS --read

Running XDP on dev:eth0 (ifindex:2) action:XDP_PASS options:read
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      0       62,566      0
XDP-RX CPU      total   62,566

> -----Original Message-----
> From: Shenwei Wang
> Sent: Tuesday, October 4, 2022 8:34 AM
> To: Jesper Dangaard Brouer <jbrouer@redhat.com>; Andrew Lunn
> <andrew@lunn.ch>
> Cc: brouer@redhat.com; David S. Miller <davem@davemloft.net>; Eric Dumazet
> <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni
> <pabeni@redhat.com>; Alexei Starovoitov <ast@kernel.org>; Daniel Borkmann
> <daniel@iogearbox.net>; Jesper Dangaard Brouer <hawk@kernel.org>; John
> Fastabend <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; imx@lists.linux.dev; Magnus Karlsson
> <magnus.karlsson@gmail.com>; Björn Töpel <bjorn@kernel.org>; Ilias
> Apalodimas <ilias.apalodimas@linaro.org>
> Subject: RE: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
> 
> 
> 
> > -----Original Message-----
> > From: Shenwei Wang
> > Sent: Tuesday, October 4, 2022 8:13 AM
> > To: Jesper Dangaard Brouer <jbrouer@redhat.com>; Andrew Lunn
> ...
> > I haven't tested xdp_rxq_info yet, and will have a try sometime later today.
> > However, for the XDP_DROP test, I did try xdp2 test case, and the
> > testing result looks reasonable. The performance of Native mode is
> > much higher than skb- mode.
> >
> > # xdp2 eth0
> >  proto 0:     475362 pkt/s
> >
> > # xdp2 -S eth0             (page_pool_release_page solution)
> >  proto 17:     71999 pkt/s
> >
> > # xdp2 -S eth0             (skb_mark_for_recycle solution)
> >  proto 17:     72228 pkt/s
> >
> 
> Correction for xdp2 -S eth0	(skb_mark_for_recycle solution)
> proto 0:          0 pkt/s
> proto 17:     122473 pkt/s
> 
> Thanks,
> Shenwei
Jesper Dangaard Brouer Oct. 6, 2022, 8:37 a.m. UTC | #25
On 05/10/2022 14.40, Shenwei Wang wrote:
> Hi Jesper,
> 
> Here is the summary of "xdp_rxq_info" testing.
> 
>                skb_mark_for_recycle           page_pool_release_page
> 
>               Native        SKB-Mode           Native          SKB-Mode
> XDP_DROP     460K           220K              460K             102K
> XDP_PASS     80K            113K              60K              62K
> 

It is very pleasing to see the *huge* performance benefit that page_pool
provides when recycling pages for SKBs (via skb_mark_for_recycle).
I did expect a performance boost, but not one of around 2x.

I guess this platform has a larger overhead for DMA-mapping and
page-allocation.

IMHO it would be valuable to include this result as part of the patch
description when you post the XDP patch again.

The only strange result is that XDP_PASS 'Native' is slower than 'SKB-mode'. I
cannot explain why, as XDP_PASS essentially does nothing and just follows
the normal driver code to the netstack.

Thanks a lot for doing these tests.
--Jesper

> The following are the testing log.
> 
> Thanks,
> Shenwei
> 
> ### skb_mark_for_recycle solution ###
> 
> ./xdp_rxq_info --dev eth0 --act XDP_DROP --read
> 
> Running XDP on dev:eth0 (ifindex:2) action:XDP_DROP options:read
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      0       466,553     0
> XDP-RX CPU      total   466,553
> 
> ./xdp_rxq_info -S --dev eth0 --act XDP_DROP --read
> 
> Running XDP on dev:eth0 (ifindex:2) action:XDP_DROP options:read
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      0       226,272     0
> XDP-RX CPU      total   226,272
> 
> ./xdp_rxq_info --dev eth0 --act XDP_PASS --read
> 
> Running XDP on dev:eth0 (ifindex:2) action:XDP_PASS options:read
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      0       80,518      0
> XDP-RX CPU      total   80,518
> 
> ./xdp_rxq_info -S --dev eth0 --act XDP_PASS --read
> 
> Running XDP on dev:eth0 (ifindex:2) action:XDP_PASS options:read
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      0       113,681     0
> XDP-RX CPU      total   113,681
> 
> 
> ### page_pool_release_page solution ###
> 
> ./xdp_rxq_info --dev eth0 --act XDP_DROP --read
> 
> Running XDP on dev:eth0 (ifindex:2) action:XDP_DROP options:read
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      0       463,145     0
> XDP-RX CPU      total   463,145
> 
> ./xdp_rxq_info -S --dev eth0 --act XDP_DROP --read
> 
> Running XDP on dev:eth0 (ifindex:2) action:XDP_DROP options:read
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      0       104,443     0
> XDP-RX CPU      total   104,443
> 
> ./xdp_rxq_info --dev eth0 --act XDP_PASS --read
> 
> Running XDP on dev:eth0 (ifindex:2) action:XDP_PASS options:read
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      0       60,539      0
> XDP-RX CPU      total   60,539
> 
> ./xdp_rxq_info -S --dev eth0 --act XDP_PASS --read
> 
> Running XDP on dev:eth0 (ifindex:2) action:XDP_PASS options:read
> XDP stats       CPU     pps         issue-pps
> XDP-RX CPU      0       62,566      0
> XDP-RX CPU      total   62,566
> 
>> -----Original Message-----
>> From: Shenwei Wang
>> Sent: Tuesday, October 4, 2022 8:34 AM
>> To: Jesper Dangaard Brouer <jbrouer@redhat.com>; Andrew Lunn
>> <andrew@lunn.ch>
>> Cc: brouer@redhat.com; David S. Miller <davem@davemloft.net>; Eric Dumazet
>> <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni
>> <pabeni@redhat.com>; Alexei Starovoitov <ast@kernel.org>; Daniel Borkmann
>> <daniel@iogearbox.net>; Jesper Dangaard Brouer <hawk@kernel.org>; John
>> Fastabend <john.fastabend@gmail.com>; netdev@vger.kernel.org; linux-
>> kernel@vger.kernel.org; imx@lists.linux.dev; Magnus Karlsson
>> <magnus.karlsson@gmail.com>; Björn Töpel <bjorn@kernel.org>; Ilias
>> Apalodimas <ilias.apalodimas@linaro.org>
>> Subject: RE: [EXT] Re: [PATCH 1/1] net: fec: add initial XDP support
>>
>>
>>
>>> -----Original Message-----
>>> From: Shenwei Wang
>>> Sent: Tuesday, October 4, 2022 8:13 AM
>>> To: Jesper Dangaard Brouer <jbrouer@redhat.com>; Andrew Lunn
>> ...
>>> I haven't tested xdp_rxq_info yet, and will have a try sometime later today.
>>> However, for the XDP_DROP test, I did try xdp2 test case, and the
>>> testing result looks reasonable. The performance of Native mode is
>>> much higher than skb- mode.
>>>
>>> # xdp2 eth0
>>>   proto 0:     475362 pkt/s
>>>
>>> # xdp2 -S eth0             (page_pool_release_page solution)
>>>   proto 17:     71999 pkt/s
>>>
>>> # xdp2 -S eth0             (skb_mark_for_recycle solution)
>>>   proto 17:     72228 pkt/s
>>>
>>
>> Correction for xdp2 -S eth0	(skb_mark_for_recycle solution)
>> proto 0:          0 pkt/s
>> proto 17:     122473 pkt/s
>>
>> Thanks,
>> Shenwei
>
Ilias Apalodimas Oct. 7, 2022, 8:08 a.m. UTC | #26
Hi Jesper,

On Thu, 6 Oct 2022 at 11:37, Jesper Dangaard Brouer <jbrouer@redhat.com> wrote:
>
>
>
> On 05/10/2022 14.40, Shenwei Wang wrote:
> > Hi Jesper,
> >
> > Here is the summary of "xdp_rxq_info" testing.
> >
> >                skb_mark_for_recycle           page_pool_release_page
> >
> >               Native        SKB-Mode           Native          SKB-Mode
> > XDP_DROP     460K           220K              460K             102K
> > XDP_PASS     80K            113K              60K              62K
> >
>
> It is very pleasing to see the *huge* performance benefit that page_pool
> provides when recycling pages for SKBs (via skb_mark_for_recycle).
> I did expect a performance boost, but not one of around 2x.

Indeed that's a pleasant surprise.  Keep in mind that if we convert
more drivers we can also get rid of the copy_break code sprinkled
around in drivers.

Thanks
/Ilias
>
> I guess this platform has a larger overhead for DMA-mapping and
> page-allocation.
>
> IMHO it would be valuable to include this result as part of the patch
> description when you post the XDP patch again.
>
> The only strange result is that XDP_PASS 'Native' is slower than 'SKB-mode'. I
> cannot explain why, as XDP_PASS essentially does nothing and just follows
> the normal driver code to the netstack.
>
> Thanks a lot for doing these tests.
> --Jesper
>
Shenwei Wang Oct. 7, 2022, 7:18 p.m. UTC | #27
Hi Jesper and Ilias,

The driver has a macro to configure the RX ring size. After testing
with different RX ring sizes, I found that the strange result may have
something to do with the ring size.

I just tested with the xdpsock application.
  -- Native here means running the command "xdpsock -i eth0"
  -- SKB-Mode means running the command "xdpsock -S -i eth0"

RX Ring Size       16       32        64       128
      Native      230K     227K      196K      160K
    SKB-Mode      207K     208K      203K      204K

It seems that the smaller the ring size, the better the performance,
which is also a strange result to me.
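
For reference, the RX ring size in this driver is derived from macros in
fec.h (see the patch below); the following sketch shows the relationship,
with values assuming 4 KiB pages and the headroom this patch adds:

	#define FEC_ENET_XDP_HEADROOM	(512)
	#define FEC_ENET_RX_PAGES	256
	#define FEC_ENET_RX_FRSIZE	(PAGE_SIZE - FEC_ENET_XDP_HEADROOM)	/* 3584 */
	#define FEC_ENET_RX_FRPPG	(PAGE_SIZE / FEC_ENET_RX_FRSIZE)	/* 1  */
	#define RX_RING_SIZE		(FEC_ENET_RX_FRPPG * FEC_ENET_RX_PAGES)	/* 256 */

The 16/32/64/128 ring sizes above were presumably obtained by shrinking
FEC_ENET_RX_PAGES accordingly.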

The following is the iperf testing result.

RX Ring Size         16         64       128
iperf              300Mbps    830Mbps   933Mbps

Thanks,
Shenwei

diff mbox series

Patch

diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index b0100fe3c9e4..f7531503aa95 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -346,8 +346,10 @@  struct bufdesc_ex {
  * the skbuffer directly.
  */
 
+#define FEC_ENET_XDP_HEADROOM	(512) /* XDP_PACKET_HEADROOM + NET_IP_ALIGN */
+
 #define FEC_ENET_RX_PAGES	256
-#define FEC_ENET_RX_FRSIZE	2048
+#define FEC_ENET_RX_FRSIZE	(PAGE_SIZE - FEC_ENET_XDP_HEADROOM)
 #define FEC_ENET_RX_FRPPG	(PAGE_SIZE / FEC_ENET_RX_FRSIZE)
 #define RX_RING_SIZE		(FEC_ENET_RX_FRPPG * FEC_ENET_RX_PAGES)
 #define FEC_ENET_TX_FRSIZE	2048
@@ -517,6 +519,22 @@  struct bufdesc_prop {
 	unsigned char dsize_log2;
 };
 
+struct fec_enet_priv_txrx_info {
+	int	offset;
+	struct	page *page;
+	struct  sk_buff *skb;
+};
+
+struct fec_enet_xdp_stats {
+	u64	xdp_pass;
+	u64	xdp_drop;
+	u64	xdp_xmit;
+	u64	xdp_redirect;
+	u64	xdp_xmit_err;
+	u64	xdp_tx;
+	u64	xdp_tx_err;
+};
+
 struct fec_enet_priv_tx_q {
 	struct bufdesc_prop bd;
 	unsigned char *tx_bounce[TX_RING_SIZE];
@@ -532,7 +550,15 @@  struct fec_enet_priv_tx_q {
 
 struct fec_enet_priv_rx_q {
 	struct bufdesc_prop bd;
-	struct  sk_buff *rx_skbuff[RX_RING_SIZE];
+	struct  fec_enet_priv_txrx_info rx_skb_info[RX_RING_SIZE];
+
+	/* page_pool */
+	struct page_pool *page_pool;
+	struct xdp_rxq_info xdp_rxq;
+	struct fec_enet_xdp_stats stats;
+
+	/* rx queue number, in the range 0-7 */
+	u8 id;
 };
 
 struct fec_stop_mode_gpr {
@@ -644,6 +670,10 @@  struct fec_enet_private {
 
 	struct imx_sc_ipc *ipc_handle;
 
+	/* XDP BPF Program */
+	unsigned long *af_xdp_zc_qps;
+	struct bpf_prog *xdp_prog;
+
 	u64 ethtool_stats[];
 };
 
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 59921218a8a4..2e30182ed770 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -66,6 +66,8 @@ 
 #include <linux/mfd/syscon.h>
 #include <linux/regmap.h>
 #include <soc/imx/cpuidle.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
 
 #include <asm/cacheflush.h>
 
@@ -87,6 +89,11 @@  static const u16 fec_enet_vlan_pri_to_queue[8] = {0, 0, 1, 1, 1, 2, 2, 2};
 #define FEC_ENET_OPD_V	0xFFF0
 #define FEC_MDIO_PM_TIMEOUT  100 /* ms */
 
+#define FEC_ENET_XDP_PASS          0
+#define FEC_ENET_XDP_CONSUMED      BIT(0)
+#define FEC_ENET_XDP_TX            BIT(1)
+#define FEC_ENET_XDP_REDIR         BIT(2)
+
 struct fec_devinfo {
 	u32 quirks;
 };
@@ -422,6 +429,49 @@  fec_enet_clear_csum(struct sk_buff *skb, struct net_device *ndev)
 	return 0;
 }
 
+static int
+fec_enet_create_page_pool(struct fec_enet_private *fep,
+			  struct fec_enet_priv_rx_q *rxq, int size)
+{
+	struct bpf_prog *xdp_prog = READ_ONCE(fep->xdp_prog);
+	struct page_pool_params pp_params = {
+		.order = 0,
+		.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
+		.pool_size = size,
+		.nid = dev_to_node(&fep->pdev->dev),
+		.dev = &fep->pdev->dev,
+		.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE,
+		.offset = FEC_ENET_XDP_HEADROOM,
+		.max_len = FEC_ENET_RX_FRSIZE,
+	};
+	int err;
+
+	rxq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rxq->page_pool)) {
+		err = PTR_ERR(rxq->page_pool);
+		rxq->page_pool = NULL;
+		return err;
+	}
+
+	err = xdp_rxq_info_reg(&rxq->xdp_rxq, fep->netdev, rxq->id, 0);
+	if (err < 0)
+		goto err_free_pp;
+
+	err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL,
+					 rxq->page_pool);
+	if (err)
+		goto err_unregister_rxq;
+
+	return 0;
+
+err_unregister_rxq:
+	xdp_rxq_info_unreg(&rxq->xdp_rxq);
+err_free_pp:
+	page_pool_destroy(rxq->page_pool);
+	rxq->page_pool = NULL;
+	return err;
+}
+
 static struct bufdesc *
 fec_enet_txq_submit_frag_skb(struct fec_enet_priv_tx_q *txq,
 			     struct sk_buff *skb,
@@ -1285,7 +1335,6 @@  fec_stop(struct net_device *ndev)
 	}
 }
 
-
 static void
 fec_timeout(struct net_device *ndev, unsigned int txqueue)
 {
@@ -1450,7 +1499,7 @@  static void fec_enet_tx(struct net_device *ndev)
 		fec_enet_tx_queue(ndev, i);
 }
 
-static int
+static int __maybe_unused
 fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff *skb)
 {
 	struct  fec_enet_private *fep = netdev_priv(ndev);
@@ -1470,8 +1519,9 @@  fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff
 	return 0;
 }
 
-static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
-			       struct bufdesc *bdp, u32 length, bool swap)
+static bool __maybe_unused
+fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
+		   struct bufdesc *bdp, u32 length, bool swap)
 {
 	struct  fec_enet_private *fep = netdev_priv(ndev);
 	struct sk_buff *new_skb;
@@ -1496,6 +1546,78 @@  static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
 	return true;
 }
 
+static void fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq,
+				struct bufdesc *bdp, int index)
+{
+	struct page *new_page;
+	dma_addr_t phys_addr;
+
+	new_page = page_pool_dev_alloc_pages(rxq->page_pool);
+	WARN_ON(!new_page);
+	rxq->rx_skb_info[index].page = new_page;
+
+	rxq->rx_skb_info[index].offset = FEC_ENET_XDP_HEADROOM;
+	phys_addr = page_pool_get_dma_addr(new_page) + FEC_ENET_XDP_HEADROOM;
+	bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
+}
+
+static u32
+fec_enet_run_xdp(struct fec_enet_private *fep, struct bpf_prog *prog,
+		 struct xdp_buff *xdp, struct fec_enet_priv_rx_q *rxq, int index)
+{
+	unsigned int sync, len = xdp->data_end - xdp->data;
+	u32 ret = FEC_ENET_XDP_PASS;
+	struct page *page;
+	int err;
+	u32 act;
+
+	act = bpf_prog_run_xdp(prog, xdp);
+
+	/* Due to xdp_adjust_tail: DMA sync for_device covers max len CPU touched */
+	sync = xdp->data_end - xdp->data_hard_start - FEC_ENET_XDP_HEADROOM;
+	sync = max(sync, len);
+
+	switch (act) {
+	case XDP_PASS:
+		rxq->stats.xdp_pass++;
+		ret = FEC_ENET_XDP_PASS;
+		break;
+
+	case XDP_TX:
+		rxq->stats.xdp_tx++;
+		bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
+		fallthrough;
+
+	case XDP_REDIRECT:
+		err = xdp_do_redirect(fep->netdev, xdp, prog);
+		rxq->stats.xdp_redirect++;
+		if (!err) {
+			ret = FEC_ENET_XDP_REDIR;
+		} else {
+			ret = FEC_ENET_XDP_CONSUMED;
+			page = virt_to_head_page(xdp->data);
+			page_pool_put_page(rxq->page_pool, page, sync, true);
+		}
+		break;
+
+	default:
+		bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
+		fallthrough;
+
+	case XDP_ABORTED:
+		fallthrough;    /* handle aborts by dropping packet */
+
+	case XDP_DROP:
+		rxq->stats.xdp_drop++;
+		ret = FEC_ENET_XDP_CONSUMED;
+		page = virt_to_head_page(xdp->data);
+		page_pool_put_page(rxq->page_pool, page, sync, true);
+		break;
+	}
+
+	return ret;
+}
+
 /* During a receive, the bd_rx.cur points to the current incoming buffer.
  * When we update through the ring, if the next incoming buffer has
  * not been given to the system, we just set the empty indicator,
@@ -1508,7 +1630,6 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 	struct fec_enet_priv_rx_q *rxq;
 	struct bufdesc *bdp;
 	unsigned short status;
-	struct  sk_buff *skb_new = NULL;
 	struct  sk_buff *skb;
 	ushort	pkt_len;
 	__u8 *data;
@@ -1517,8 +1638,12 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 	bool	vlan_packet_rcvd = false;
 	u16	vlan_tag;
 	int	index = 0;
-	bool	is_copybreak;
 	bool	need_swap = fep->quirks & FEC_QUIRK_SWAP_FRAME;
+	struct page *page;
+	struct xdp_buff xdp;
+	u32 ret, xdp_result = FEC_ENET_XDP_PASS;
+
+	struct bpf_prog *xdp_prog = READ_ONCE(fep->xdp_prog);
 
 #ifdef CONFIG_M532x
 	flush_cache_all();
@@ -1529,6 +1654,7 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 	 * These get messed up if we get called due to a busy condition.
 	 */
 	bdp = rxq->bd.cur;
+	xdp_init_buff(&xdp, PAGE_SIZE, &rxq->xdp_rxq);
 
 	while (!((status = fec16_to_cpu(bdp->cbd_sc)) & BD_ENET_RX_EMPTY)) {
 
@@ -1570,31 +1696,37 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 		ndev->stats.rx_bytes += pkt_len;
 
 		index = fec_enet_get_bd_index(bdp, &rxq->bd);
-		skb = rxq->rx_skbuff[index];
+		page = rxq->rx_skb_info[index].page;
+
+		dma_sync_single_for_cpu(&fep->pdev->dev,
+					fec32_to_cpu(bdp->cbd_bufaddr),
+					pkt_len,
+					DMA_FROM_DEVICE);
+
+		prefetch(page_address(page));
+		fec_enet_update_cbd(rxq, bdp, index);
+
+		if (xdp_prog) {
+			xdp_prepare_buff(&xdp, page_address(page),
+					 FEC_ENET_XDP_HEADROOM, pkt_len, false);
+
+			ret = fec_enet_run_xdp(fep, xdp_prog, &xdp, rxq, index);
+			xdp_result |= ret;
+			if (ret != FEC_ENET_XDP_PASS)
+				goto rx_processing_done;
+		}
 
 		/* The packet length includes FCS, but we don't want to
 		 * include that when passing upstream as it messes up
 		 * bridging applications.
 		 */
-		is_copybreak = fec_enet_copybreak(ndev, &skb, bdp, pkt_len - 4,
-						  need_swap);
-		if (!is_copybreak) {
-			skb_new = netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE);
-			if (unlikely(!skb_new)) {
-				ndev->stats.rx_dropped++;
-				goto rx_processing_done;
-			}
-			dma_unmap_single(&fep->pdev->dev,
-					 fec32_to_cpu(bdp->cbd_bufaddr),
-					 FEC_ENET_RX_FRSIZE - fep->rx_align,
-					 DMA_FROM_DEVICE);
-		}
-
-		prefetch(skb->data - NET_IP_ALIGN);
+		skb = build_skb(page_address(page), FEC_ENET_RX_FRSIZE);
+		skb_reserve(skb, FEC_ENET_XDP_HEADROOM);
 		skb_put(skb, pkt_len - 4);
 		data = skb->data;
+		page_pool_release_page(rxq->page_pool, page);
 
-		if (!is_copybreak && need_swap)
+		if (need_swap)
 			swap_buffer(data, pkt_len);
 
 #if !defined(CONFIG_M5272)
@@ -1649,16 +1781,6 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 		skb_record_rx_queue(skb, queue_id);
 		napi_gro_receive(&fep->napi, skb);
 
-		if (is_copybreak) {
-			dma_sync_single_for_device(&fep->pdev->dev,
-						   fec32_to_cpu(bdp->cbd_bufaddr),
-						   FEC_ENET_RX_FRSIZE - fep->rx_align,
-						   DMA_FROM_DEVICE);
-		} else {
-			rxq->rx_skbuff[index] = skb_new;
-			fec_enet_new_rxbdp(ndev, bdp, skb_new);
-		}
-
 rx_processing_done:
 		/* Clear the status flags for this buffer */
 		status &= ~BD_ENET_RX_STATS;
@@ -1689,6 +1811,10 @@  fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 		writel(0, rxq->bd.reg_desc_active);
 	}
 	rxq->bd.cur = bdp;
+
+	if (xdp_result & FEC_ENET_XDP_REDIR)
+		xdp_do_flush_map();
+
 	return pkt_received;
 }
 
@@ -2584,15 +2710,46 @@  static const struct fec_stat {
 	{ "IEEE_rx_octets_ok", IEEE_R_OCTETS_OK },
 };
 
-#define FEC_STATS_SIZE		(ARRAY_SIZE(fec_stats) * sizeof(u64))
+static struct fec_xdp_stat {
+	char name[ETH_GSTRING_LEN];
+	u32 count;
+} fec_xdp_stats[] = {
+	{ "rx_xdp_redirect", 0 },
+	{ "rx_xdp_pass", 0 },
+	{ "rx_xdp_drop", 0 },
+	{ "rx_xdp_tx", 0 },
+	{ "rx_xdp_tx_errors", 0 },
+	{ "tx_xdp_xmit", 0 },
+	{ "tx_xdp_xmit_errors", 0 },
+};
+
+#define FEC_STATS_SIZE	((ARRAY_SIZE(fec_stats) + \
+			ARRAY_SIZE(fec_xdp_stats)) * sizeof(u64))
 
 static void fec_enet_update_ethtool_stats(struct net_device *dev)
 {
 	struct fec_enet_private *fep = netdev_priv(dev);
+	struct fec_enet_priv_rx_q *rxq;
+	struct fec_xdp_stat xdp_stats[7];
+	int off = ARRAY_SIZE(fec_stats);
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(fec_stats); i++)
 		fep->ethtool_stats[i] = readl(fep->hwp + fec_stats[i].offset);
+
+	for (i = fep->num_rx_queues - 1; i >= 0; i--) {
+		rxq = fep->rx_queue[i];
+		xdp_stats[0].count += rxq->stats.xdp_redirect;
+		xdp_stats[1].count += rxq->stats.xdp_pass;
+		xdp_stats[2].count += rxq->stats.xdp_drop;
+		xdp_stats[3].count += rxq->stats.xdp_tx;
+		xdp_stats[4].count += rxq->stats.xdp_tx_err;
+		xdp_stats[5].count += rxq->stats.xdp_xmit;
+		xdp_stats[6].count += rxq->stats.xdp_xmit_err;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(fec_xdp_stats); i++)
+		fep->ethtool_stats[i + off] = xdp_stats[i].count;
 }
 
 static void fec_enet_get_ethtool_stats(struct net_device *dev,
@@ -2609,12 +2766,16 @@  static void fec_enet_get_ethtool_stats(struct net_device *dev,
 static void fec_enet_get_strings(struct net_device *netdev,
 	u32 stringset, u8 *data)
 {
+	int off = ARRAY_SIZE(fec_stats);
 	int i;
 	switch (stringset) {
 	case ETH_SS_STATS:
 		for (i = 0; i < ARRAY_SIZE(fec_stats); i++)
 			memcpy(data + i * ETH_GSTRING_LEN,
 				fec_stats[i].name, ETH_GSTRING_LEN);
+		for (i = 0; i < ARRAY_SIZE(fec_xdp_stats); i++)
+			memcpy(data + (i + off) * ETH_GSTRING_LEN,
+			       fec_xdp_stats[i].name, ETH_GSTRING_LEN);
 		break;
 	case ETH_SS_TEST:
 		net_selftest_get_strings(data);
@@ -2626,7 +2787,7 @@  static int fec_enet_get_sset_count(struct net_device *dev, int sset)
 {
 	switch (sset) {
 	case ETH_SS_STATS:
-		return ARRAY_SIZE(fec_stats);
+		return ARRAY_SIZE(fec_stats) + ARRAY_SIZE(fec_xdp_stats);
 	case ETH_SS_TEST:
 		return net_selftest_get_count();
 	default:
@@ -2645,6 +2806,8 @@  static void fec_enet_clear_ethtool_stats(struct net_device *dev)
 	for (i = 0; i < ARRAY_SIZE(fec_stats); i++)
 		writel(0, fep->hwp + fec_stats[i].offset);
 
+	for (i = 0; i < ARRAY_SIZE(fec_xdp_stats); i++)
+		fec_xdp_stats[i].count = 0;
 	/* Don't disable MIB statistics counters */
 	writel(0, fep->hwp + FEC_MIB_CTRLSTAT);
 }
@@ -3011,17 +3174,14 @@  static void fec_enet_free_buffers(struct net_device *ndev)
 		rxq = fep->rx_queue[q];
 		bdp = rxq->bd.base;
 		for (i = 0; i < rxq->bd.ring_size; i++) {
-			skb = rxq->rx_skbuff[i];
-			rxq->rx_skbuff[i] = NULL;
-			if (skb) {
-				dma_unmap_single(&fep->pdev->dev,
-						 fec32_to_cpu(bdp->cbd_bufaddr),
-						 FEC_ENET_RX_FRSIZE - fep->rx_align,
-						 DMA_FROM_DEVICE);
-				dev_kfree_skb(skb);
-			}
+			page_pool_release_page(rxq->page_pool, rxq->rx_skb_info[i].page);
 			bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
 		}
+
+		if (xdp_rxq_info_is_reg(&rxq->xdp_rxq))
+			xdp_rxq_info_unreg(&rxq->xdp_rxq);
+		page_pool_destroy(rxq->page_pool);
+		rxq->page_pool = NULL;
 	}
 
 	for (q = 0; q < fep->num_tx_queues; q++) {
@@ -3111,24 +3271,32 @@  static int
 fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
-	unsigned int i;
-	struct sk_buff *skb;
+	unsigned int i, err;
 	struct bufdesc	*bdp;
 	struct fec_enet_priv_rx_q *rxq;
 
+	dma_addr_t phys_addr;
+	struct page *page;
+
 	rxq = fep->rx_queue[queue];
 	bdp = rxq->bd.base;
+
+	err = fec_enet_create_page_pool(fep, rxq, rxq->bd.ring_size);
+	if (err < 0) {
+		netdev_err(ndev, "%s failed queue %d (%d)\n", __func__, queue, err);
+		return err;
+	}
+
 	for (i = 0; i < rxq->bd.ring_size; i++) {
-		skb = __netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE, GFP_KERNEL);
-		if (!skb)
+		page = page_pool_dev_alloc_pages(rxq->page_pool);
+		if (!page)
 			goto err_alloc;
 
-		if (fec_enet_new_rxbdp(ndev, bdp, skb)) {
-			dev_kfree_skb(skb);
-			goto err_alloc;
-		}
+		phys_addr = page_pool_get_dma_addr(page) + FEC_ENET_XDP_HEADROOM;
+		bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
 
-		rxq->rx_skbuff[i] = skb;
+		rxq->rx_skb_info[i].page = page;
+		rxq->rx_skb_info[i].offset = FEC_ENET_XDP_HEADROOM;
 		bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY);
 
 		if (fep->bufdesc_ex) {
@@ -3490,6 +3658,144 @@  static u16 fec_enet_select_queue(struct net_device *ndev, struct sk_buff *skb,
 	return fec_enet_vlan_pri_to_queue[vlan_tag >> 13];
 }
 
+static int fec_enet_bpf(struct net_device *dev, struct netdev_bpf *bpf)
+{
+	struct fec_enet_private *fep = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	bool is_run = netif_running(dev);
+
+	switch (bpf->command) {
+	case XDP_SETUP_PROG:
+		if (is_run)
+			fec_enet_close(dev);
+		old_prog = xchg(&fep->xdp_prog, bpf->prog);
+
+		if (is_run)
+			fec_enet_open(dev);
+
+		if (old_prog)
+			bpf_prog_put(old_prog);
+
+		return 0;
+
+	case XDP_SETUP_XSK_POOL:
+		return -EOPNOTSUPP;
+
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int
+fec_enet_xdp_get_tx_queue(struct fec_enet_private *fep, int cpu)
+{
+	int index = cpu;
+
+	if (unlikely(index < 0))
+		index = 0;
+
+	while (index >= fep->num_tx_queues)
+		index -= fep->num_tx_queues;
+
+	return index;
+}
+
+static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
+				   struct fec_enet_priv_tx_q *txq,
+				   struct xdp_frame *frame)
+{
+	struct bufdesc *bdp, *last_bdp;
+	dma_addr_t dma_addr;
+	unsigned int index, status, estatus;
+	int entries_free;
+
+	entries_free = fec_enet_get_free_txdesc_num(txq);
+	if (entries_free < MAX_SKB_FRAGS + 1) {
+		netdev_err(fep->netdev, "NOT enough BD for SG!\n");
+		return NETDEV_TX_OK;
+	}
+
+	/* Fill in a Tx ring entry */
+	bdp = txq->bd.cur;
+	last_bdp = bdp;
+	status = fec16_to_cpu(bdp->cbd_sc);
+	status &= ~BD_ENET_TX_STATS;
+
+	index = fec_enet_get_bd_index(bdp, &txq->bd);
+
+	dma_addr = dma_map_single(&fep->pdev->dev, frame->data,
+				  frame->len, DMA_TO_DEVICE);
+	if (dma_mapping_error(&fep->pdev->dev, dma_addr))
+		return FEC_ENET_XDP_CONSUMED;
+
+	status |= (BD_ENET_TX_INTR | BD_ENET_TX_LAST);
+	if (fep->bufdesc_ex)
+		estatus = BD_ENET_TX_INT;
+
+	bdp->cbd_bufaddr = cpu_to_fec32(dma_addr);
+	bdp->cbd_datlen = cpu_to_fec16(frame->len);
+
+	if (fep->bufdesc_ex) {
+		struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+
+		if (fep->quirks & FEC_QUIRK_HAS_AVB)
+			estatus |= FEC_TX_BD_FTYPE(txq->bd.qid);
+
+		ebdp->cbd_bdu = 0;
+		ebdp->cbd_esc = cpu_to_fec32(estatus);
+	}
+
+	index = fec_enet_get_bd_index(last_bdp, &txq->bd);
+	txq->tx_skbuff[index] = NULL;
+
+	/* Send it on its way.  Tell FEC it's ready, interrupt when done,
+	 * it's the last BD of the frame, and to put the CRC on the end.
+	 */
+	status |= (BD_ENET_TX_READY | BD_ENET_TX_TC);
+	bdp->cbd_sc = cpu_to_fec16(status);
+
+	/* If this was the last BD in the ring, start at the beginning again. */
+	bdp = fec_enet_get_nextdesc(last_bdp, &txq->bd);
+
+	txq->bd.cur = bdp;
+
+	return 0;
+}
+
+static int fec_enet_xdp_xmit(struct net_device *dev,
+			     int num_frames,
+			     struct xdp_frame **frames,
+			     u32 flags)
+{
+	struct fec_enet_private *fep = netdev_priv(dev);
+	struct fec_enet_priv_tx_q *txq;
+	int cpu = smp_processor_id();
+	struct netdev_queue *nq;
+	unsigned int queue;
+	int i, nxmit = 0;
+
+	queue = fec_enet_xdp_get_tx_queue(fep, cpu);
+	txq = fep->tx_queue[queue];
+	nq = netdev_get_tx_queue(fep->netdev, queue);
+
+	__netif_tx_lock(nq, cpu);
+
+	for (i = 0; i < num_frames; i++) {
+		fec_enet_txq_xmit_frame(fep, txq, frames[i]);
+		nxmit++;
+	}
+
+	/* Make sure the update to bdp and tx_skbuff are performed. */
+	wmb();
+
+	/* Trigger transmission start */
+	writel(0, txq->bd.reg_desc_active);
+
+	__netif_tx_unlock(nq);
+
+	return num_frames;
+}
+
 static const struct net_device_ops fec_netdev_ops = {
 	.ndo_open		= fec_enet_open,
 	.ndo_stop		= fec_enet_close,
@@ -3504,6 +3810,8 @@  static const struct net_device_ops fec_netdev_ops = {
 	.ndo_poll_controller	= fec_poll_controller,
 #endif
 	.ndo_set_features	= fec_set_features,
+	.ndo_bpf		= fec_enet_bpf,
+	.ndo_xdp_xmit		= fec_enet_xdp_xmit,
 };
 
 static const unsigned short offset_des_active_rxq[] = {