Message ID | 20230526054621.18371-2-liangchen.linux@gmail.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Series | [net-next,1/5] virtio_net: Fix an unsafe reference to the page chain |
On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote: > > The implementation at the moment uses one page per packet in both the > normal and XDP path. It's better to explain why we need a page pool and how it can help the performance. > In addition, introducing a module parameter to enable > or disable the usage of page pool (disabled by default). If page pool wins for most of the cases, any reason to disable it by default? > > In single-core vm testing environments, it gives a modest performance gain > in the normal path. > Upstream codebase: 47.5 Gbits/sec > Upstream codebase + page_pool support: 50.2 Gbits/sec > > In multi-core vm testing environments, The most significant performance > gain is observed in XDP cpumap: > Upstream codebase: 1.38 Gbits/sec > Upstream codebase + page_pool support: 9.74 Gbits/sec Please show more details on the test. E.g which kinds of tests have you measured? Btw, it would be better to measure PPS as well. > > With this foundation, we can further integrate page pool fragmentation and > DMA map/unmap support. > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > --- > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- I believe we should make virtio-net to select CONFIG_PAGE_POOL or do the ifdef tricks at least. > 1 file changed, 146 insertions(+), 42 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index c5dca0d92e64..99c0ca0c1781 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > module_param(gso, bool, 0444); > module_param(napi_tx, bool, 0644); > > +static bool page_pool_enabled; > +module_param(page_pool_enabled, bool, 0400); > + > /* FIXME: MTU in config. */ > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > #define GOOD_COPY_LEN 128 > @@ -159,6 +162,9 @@ struct receive_queue { > /* Chain pages by the private ptr. */ > struct page *pages; > > + /* Page pool */ > + struct page_pool *page_pool; > + > /* Average packet length for mergeable receive buffers. */ > struct ewma_pkt_len mrg_avg_pkt_len; > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > return skb; > } > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > +{ > + if (rq->page_pool) > + page_pool_put_full_page(rq->page_pool, page, true); > + else > + put_page(page); > +} > + > /* Called from bottom half context */ > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > struct receive_queue *rq, > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > hdr = skb_vnet_hdr(skb); > memcpy(hdr, hdr_p, hdr_len); > if (page_to_free) > - put_page(page_to_free); > + virtnet_put_page(rq, page_to_free); > > return skb; > } > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > return ret; > } > > -static void put_xdp_frags(struct xdp_buff *xdp) > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > { rq could be fetched from xdp_rxq_info? 
> struct skb_shared_info *shinfo; > struct page *xdp_page; > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > shinfo = xdp_get_shared_info_from_buff(xdp); > for (i = 0; i < shinfo->nr_frags; i++) { > xdp_page = skb_frag_page(&shinfo->frags[i]); > - put_page(xdp_page); > + virtnet_put_page(rq, xdp_page); > } > } > } > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > if (page_off + *len + tailroom > PAGE_SIZE) > return NULL; > > - page = alloc_page(GFP_ATOMIC); > + if (rq->page_pool) > + page = page_pool_dev_alloc_pages(rq->page_pool); > + else > + page = alloc_page(GFP_ATOMIC); > + > if (!page) > return NULL; > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > * is sending packet larger than the MTU. > */ > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > - put_page(p); > + virtnet_put_page(rq, p); > goto err_buf; > } > > memcpy(page_address(page) + page_off, > page_address(p) + off, buflen); > page_off += buflen; > - put_page(p); > + virtnet_put_page(rq, p); > } > > /* Headroom does not contribute to packet length */ > *len = page_off - VIRTIO_XDP_HEADROOM; > return page; > err_buf: > - __free_pages(page, 0); > + if (rq->page_pool) > + page_pool_put_full_page(rq->page_pool, page, true); > + else > + __free_pages(page, 0); > return NULL; > } > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > } > stats->bytes += len; > page = virt_to_head_page(buf); > - put_page(page); > + virtnet_put_page(rq, page); > } > } > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > cur_frag_size = truesize; > xdp_frags_truesz += cur_frag_size; > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > - put_page(page); > + virtnet_put_page(rq, page); > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > dev->name, len, (unsigned long)(truesize - room)); > dev->stats.rx_length_errors++; > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > return 0; > > err: > - put_xdp_frags(xdp); > + put_xdp_frags(xdp, rq); > return -EINVAL; > } > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > if (*len + xdp_room > PAGE_SIZE) > return NULL; > > - xdp_page = alloc_page(GFP_ATOMIC); > + if (rq->page_pool) > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > + else > + xdp_page = alloc_page(GFP_ATOMIC); > if (!xdp_page) > return NULL; > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > *frame_sz = PAGE_SIZE; > > - put_page(*page); > + virtnet_put_page(rq, *page); > > *page = xdp_page; > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > if (unlikely(!head_skb)) > break; > + if (rq->page_pool) > + skb_mark_for_recycle(head_skb); > return head_skb; > > case XDP_TX: > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > break; > } > > - put_xdp_frags(&xdp); > + put_xdp_frags(&xdp, rq); > > err_xdp: > - put_page(page); > + virtnet_put_page(rq, page); > mergeable_buf_free(rq, num_buf, dev, stats); > > stats->xdp_drops++; > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > curr_skb = head_skb; > > + if (rq->page_pool) > + skb_mark_for_recycle(curr_skb); > 
+ > if (unlikely(!curr_skb)) > goto err_skb; > while (--num_buf) { > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > curr_skb = nskb; > head_skb->truesize += nskb->truesize; > num_skb_frags = 0; > + if (rq->page_pool) > + skb_mark_for_recycle(curr_skb); > } > if (curr_skb != head_skb) { > head_skb->data_len += len; > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > } > offset = buf - page_address(page); > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > - put_page(page); > + virtnet_put_page(rq, page); I wonder why not we can't do this during buffer allocation like other drivers? > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > len, truesize); > } else { > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > return head_skb; > > err_skb: > - put_page(page); > + virtnet_put_page(rq, page); > mergeable_buf_free(rq, num_buf, dev, stats); > > err_buf: > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > * disabled GSO for XDP, it won't be a big issue. > */ > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > - return -ENOMEM; > + if (rq->page_pool) { > + struct page *page; > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > - buf += headroom; /* advance address leaving hole at front of pkt */ > - get_page(alloc_frag->page); > - alloc_frag->offset += len + room; > - hole = alloc_frag->size - alloc_frag->offset; > - if (hole < len + room) { > - /* To avoid internal fragmentation, if there is very likely not > - * enough space for another buffer, add the remaining space to > - * the current buffer. > - * XDP core assumes that frame_size of xdp_buff and the length > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > - */ > - if (!headroom) > - len += hole; > - alloc_frag->offset += hole; > - } > + page = page_pool_dev_alloc_pages(rq->page_pool); > + if (unlikely(!page)) > + return -ENOMEM; > + buf = (char *)page_address(page); > + buf += headroom; /* advance address leaving hole at front of pkt */ > + } else { > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) Why not simply use a helper like virtnet_page_frag_refill() and add the page_pool allocation logic there? It helps to reduce the changeset. > + return -ENOMEM; > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > + buf += headroom; /* advance address leaving hole at front of pkt */ > + get_page(alloc_frag->page); > + alloc_frag->offset += len + room; > + hole = alloc_frag->size - alloc_frag->offset; > + if (hole < len + room) { > + /* To avoid internal fragmentation, if there is very likely not > + * enough space for another buffer, add the remaining space to > + * the current buffer. > + * XDP core assumes that frame_size of xdp_buff and the length > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> + */ > + if (!headroom) > + len += hole; > + alloc_frag->offset += hole; > + } > + } > sg_init_one(rq->sg, buf, len); > ctx = mergeable_len_to_ctx(len + room, headroom); > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > if (err < 0) > - put_page(virt_to_head_page(buf)); > + virtnet_put_page(rq, virt_to_head_page(buf)); > > return err; > } > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > if (err < 0) > return err; > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > - MEM_TYPE_PAGE_SHARED, NULL); > + if (vi->rq[qp_index].page_pool) > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > + MEM_TYPE_PAGE_POOL, > + vi->rq[qp_index].page_pool); > + else > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > + MEM_TYPE_PAGE_SHARED, > + NULL); > + > if (err < 0) > goto err_xdp_reg_mem_model; > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > virtnet_sq_stats_desc[j].desc); > } > + page_pool_ethtool_stats_get_strings(p); > break; > } > } > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > switch (sset) { > case ETH_SS_STATS: > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > - VIRTNET_SQ_STATS_LEN); > + VIRTNET_SQ_STATS_LEN + > + (page_pool_enabled && vi->mergeable_rx_bufs ? > + page_pool_ethtool_stats_get_count() : 0)); > default: > return -EOPNOTSUPP; > } > } > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > +{ > +#ifdef CONFIG_PAGE_POOL_STATS > + struct virtnet_info *vi = netdev_priv(dev); > + struct page_pool_stats pp_stats = {}; > + int i; > + > + for (i = 0; i < vi->curr_queue_pairs; i++) { > + if (!vi->rq[i].page_pool) > + continue; > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > + } > + page_pool_ethtool_stats_get(data, &pp_stats); > +#endif /* CONFIG_PAGE_POOL_STATS */ > +} > + > static void virtnet_get_ethtool_stats(struct net_device *dev, > struct ethtool_stats *stats, u64 *data) > { > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > idx += VIRTNET_SQ_STATS_LEN; > } > + > + virtnet_get_page_pool_stats(dev, &data[idx]); > } > > static void virtnet_get_channels(struct net_device *dev, > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > for (i = 0; i < vi->max_queue_pairs; i++) { > __netif_napi_del(&vi->rq[i].napi); > __netif_napi_del(&vi->sq[i].napi); > + if (vi->rq[i].page_pool) > + page_pool_destroy(vi->rq[i].page_pool); > } > > /* We called __netif_napi_del(), > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > struct virtnet_info *vi = vq->vdev->priv; > int i = vq2rxq(vq); > > - if (vi->mergeable_rx_bufs) > - put_page(virt_to_head_page(buf)); > - else if (vi->big_packets) > + if (vi->mergeable_rx_bufs) { > + if (vi->rq[i].page_pool) { > + page_pool_put_full_page(vi->rq[i].page_pool, > + virt_to_head_page(buf), > + true); > + } else { > + put_page(virt_to_head_page(buf)); > + } > + } else if (vi->big_packets) { > give_pages(&vi->rq[i], buf); Any reason only mergeable were modified but not for small and big? 
Thanks > - else > + } else { > put_page(virt_to_head_page(buf)); > + } > } > > static void free_unused_bufs(struct virtnet_info *vi) > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > virtnet_free_queues(vi); > } > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > +{ > + struct virtio_device *vdev = rq->vq->vdev; > + > + struct page_pool_params pp_params = { > + .order = 0, > + .pool_size = rq->vq->num_max, > + .nid = dev_to_node(vdev->dev.parent), > + .dev = vdev->dev.parent, > + .offset = 0, > + }; > + > + rq->page_pool = page_pool_create(&pp_params); > + if (IS_ERR(rq->page_pool)) { > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > + PTR_ERR(rq->page_pool)); > + rq->page_pool = NULL; > + } > +} > + > /* How large should a single buffer be so a queue full of these can fit at > * least one full packet? > * Logic below assumes the mergeable buffer header is used. > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > vi->rq[i].vq = vqs[rxq2vq(i)]; > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > vi->sq[i].vq = vqs[txq2vq(i)]; > + > + if (page_pool_enabled && vi->mergeable_rx_bufs) > + virtnet_alloc_page_pool(&vi->rq[i]); > + else > + dev_warn(&vi->vdev->dev, > + "page pool only support mergeable mode\n"); > + > } > > /* run here: ret == 0. */ > -- > 2.31.1 >
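For readers following the refactoring suggestion in the review above, a minimal sketch of what a combined allocation helper could look like is shown below. The name virtnet_page_frag_refill() is taken from the review comment; the signature, the decision to leave the page_frag offset/"hole" bookkeeping with the caller, and the exact error handling are illustrative assumptions, not code from this series.

```c
/* Illustrative sketch only (not part of the posted series): a single
 * allocation helper so callers such as add_recvbuf_mergeable() do not
 * branch on rq->page_pool themselves. Returns the buffer address to
 * post to the virtqueue, or NULL on allocation failure. The page_frag
 * offset/"hole" bookkeeping stays with the caller in this sketch.
 */
static void *virtnet_page_frag_refill(struct receive_queue *rq,
				      struct page_frag *alloc_frag,
				      unsigned int len, unsigned int room,
				      gfp_t gfp)
{
	struct page *page;

	if (rq->page_pool) {
		/* Page pool path: one full page per receive buffer. */
		page = page_pool_dev_alloc_pages(rq->page_pool);
		return page ? page_address(page) : NULL;
	}

	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
		return NULL;

	get_page(alloc_frag->page);
	return (char *)page_address(alloc_frag->page) + alloc_frag->offset;
}
```

The point of such a helper is that the allocation sites stay branch-free, which is what would shrink the changeset as requested in the review.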
Hi Liang, kernel test robot noticed the following build errors: [auto build test ERROR on net-next/main] url: https://github.com/intel-lab-lkp/linux/commits/Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805 base: net-next/main patch link: https://lore.kernel.org/r/20230526054621.18371-2-liangchen.linux%40gmail.com patch subject: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20230526/202305262334.GiFQ3wpG-lkp@intel.com/config) compiler: gcc-11 (Debian 11.3.0-12) 11.3.0 reproduce (this is a W=1 build): # https://github.com/intel-lab-lkp/linux/commit/bfba563f43bba37181d8502cb2e566c32f96ec9e git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805 git checkout bfba563f43bba37181d8502cb2e566c32f96ec9e # save the config file mkdir build_dir && cp config build_dir/.config make W=1 O=build_dir ARCH=x86_64 olddefconfig make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202305262334.GiFQ3wpG-lkp@intel.com/ All errors (new ones prefixed by >>): ld: vmlinux.o: in function `virtnet_find_vqs': >> virtio_net.c:(.text+0x901fb5): undefined reference to `page_pool_create' ld: vmlinux.o: in function `add_recvbuf_mergeable.isra.0': >> virtio_net.c:(.text+0x905618): undefined reference to `page_pool_alloc_pages' ld: vmlinux.o: in function `xdp_linearize_page': virtio_net.c:(.text+0x906b6b): undefined reference to `page_pool_alloc_pages' ld: vmlinux.o: in function `mergeable_xdp_get_buf.isra.0': virtio_net.c:(.text+0x90728f): undefined reference to `page_pool_alloc_pages'
On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote: > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > The implementation at the moment uses one page per packet in both the > > normal and XDP path. > > It's better to explain why we need a page pool and how it can help the > performance. > Sure, I will include that on v2. > > In addition, introducing a module parameter to enable > > or disable the usage of page pool (disabled by default). > > If page pool wins for most of the cases, any reason to disable it by default? > Thank you for raising the point. It does make sense to enable it by default. > > > > In single-core vm testing environments, it gives a modest performance gain > > in the normal path. > > Upstream codebase: 47.5 Gbits/sec > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > In multi-core vm testing environments, The most significant performance > > gain is observed in XDP cpumap: > > Upstream codebase: 1.38 Gbits/sec > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > Please show more details on the test. E.g which kinds of tests have > you measured? > > Btw, it would be better to measure PPS as well. > Sure. It will be added on v2. > > > > With this foundation, we can further integrate page pool fragmentation and > > DMA map/unmap support. > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > --- > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do > the ifdef tricks at least. > Sure. it will be done on v2. > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index c5dca0d92e64..99c0ca0c1781 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > module_param(gso, bool, 0444); > > module_param(napi_tx, bool, 0644); > > > > +static bool page_pool_enabled; > > +module_param(page_pool_enabled, bool, 0400); > > + > > /* FIXME: MTU in config. */ > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > #define GOOD_COPY_LEN 128 > > @@ -159,6 +162,9 @@ struct receive_queue { > > /* Chain pages by the private ptr. */ > > struct page *pages; > > > > + /* Page pool */ > > + struct page_pool *page_pool; > > + > > /* Average packet length for mergeable receive buffers. */ > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > return skb; > > } > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > +{ > > + if (rq->page_pool) > > + page_pool_put_full_page(rq->page_pool, page, true); > > + else > > + put_page(page); > > +} > > + > > /* Called from bottom half context */ > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > struct receive_queue *rq, > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > hdr = skb_vnet_hdr(skb); > > memcpy(hdr, hdr_p, hdr_len); > > if (page_to_free) > > - put_page(page_to_free); > > + virtnet_put_page(rq, page_to_free); > > > > return skb; > > } > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > return ret; > > } > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > { > > rq could be fetched from xdp_rxq_info? 
Yeah, it has the queue_index there. > > > struct skb_shared_info *shinfo; > > struct page *xdp_page; > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > shinfo = xdp_get_shared_info_from_buff(xdp); > > for (i = 0; i < shinfo->nr_frags; i++) { > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > - put_page(xdp_page); > > + virtnet_put_page(rq, xdp_page); > > } > > } > > } > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > if (page_off + *len + tailroom > PAGE_SIZE) > > return NULL; > > > > - page = alloc_page(GFP_ATOMIC); > > + if (rq->page_pool) > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > + else > > + page = alloc_page(GFP_ATOMIC); > > + > > if (!page) > > return NULL; > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > * is sending packet larger than the MTU. > > */ > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > - put_page(p); > > + virtnet_put_page(rq, p); > > goto err_buf; > > } > > > > memcpy(page_address(page) + page_off, > > page_address(p) + off, buflen); > > page_off += buflen; > > - put_page(p); > > + virtnet_put_page(rq, p); > > } > > > > /* Headroom does not contribute to packet length */ > > *len = page_off - VIRTIO_XDP_HEADROOM; > > return page; > > err_buf: > > - __free_pages(page, 0); > > + if (rq->page_pool) > > + page_pool_put_full_page(rq->page_pool, page, true); > > + else > > + __free_pages(page, 0); > > return NULL; > > } > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > } > > stats->bytes += len; > > page = virt_to_head_page(buf); > > - put_page(page); > > + virtnet_put_page(rq, page); > > } > > } > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > cur_frag_size = truesize; > > xdp_frags_truesz += cur_frag_size; > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > - put_page(page); > > + virtnet_put_page(rq, page); > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > dev->name, len, (unsigned long)(truesize - room)); > > dev->stats.rx_length_errors++; > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > return 0; > > > > err: > > - put_xdp_frags(xdp); > > + put_xdp_frags(xdp, rq); > > return -EINVAL; > > } > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > if (*len + xdp_room > PAGE_SIZE) > > return NULL; > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > + if (rq->page_pool) > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > + else > > + xdp_page = alloc_page(GFP_ATOMIC); > > if (!xdp_page) > > return NULL; > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > *frame_sz = PAGE_SIZE; > > > > - put_page(*page); > > + virtnet_put_page(rq, *page); > > > > *page = xdp_page; > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > if (unlikely(!head_skb)) > > break; > > + if (rq->page_pool) > > + skb_mark_for_recycle(head_skb); > > return head_skb; > > > > case XDP_TX: > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > break; > > } > > > > - put_xdp_frags(&xdp); > > + put_xdp_frags(&xdp, rq); > > > > err_xdp: > > - put_page(page); > > + virtnet_put_page(rq, page); > > mergeable_buf_free(rq, num_buf, dev, 
stats); > > > > stats->xdp_drops++; > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > curr_skb = head_skb; > > > > + if (rq->page_pool) > > + skb_mark_for_recycle(curr_skb); > > + > > if (unlikely(!curr_skb)) > > goto err_skb; > > while (--num_buf) { > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > curr_skb = nskb; > > head_skb->truesize += nskb->truesize; > > num_skb_frags = 0; > > + if (rq->page_pool) > > + skb_mark_for_recycle(curr_skb); > > } > > if (curr_skb != head_skb) { > > head_skb->data_len += len; > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > } > > offset = buf - page_address(page); > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > - put_page(page); > > + virtnet_put_page(rq, page); > > I wonder why not we can't do this during buffer allocation like other drivers? > Sorry, I don't quite understand the point here. Would you please elaborate a bit more? > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > len, truesize); > > } else { > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > return head_skb; > > > > err_skb: > > - put_page(page); > > + virtnet_put_page(rq, page); > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > err_buf: > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > * disabled GSO for XDP, it won't be a big issue. > > */ > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > - return -ENOMEM; > > + if (rq->page_pool) { > > + struct page *page; > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > - get_page(alloc_frag->page); > > - alloc_frag->offset += len + room; > > - hole = alloc_frag->size - alloc_frag->offset; > > - if (hole < len + room) { > > - /* To avoid internal fragmentation, if there is very likely not > > - * enough space for another buffer, add the remaining space to > > - * the current buffer. > > - * XDP core assumes that frame_size of xdp_buff and the length > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > - */ > > - if (!headroom) > > - len += hole; > > - alloc_frag->offset += hole; > > - } > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > + if (unlikely(!page)) > > + return -ENOMEM; > > + buf = (char *)page_address(page); > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > + } else { > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > Why not simply use a helper like virtnet_page_frag_refill() and add > the page_pool allocation logic there? It helps to reduce the > changeset. > Sure. Will do that on v2. > > + return -ENOMEM; > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > + get_page(alloc_frag->page); > > + alloc_frag->offset += len + room; > > + hole = alloc_frag->size - alloc_frag->offset; > > + if (hole < len + room) { > > + /* To avoid internal fragmentation, if there is very likely not > > + * enough space for another buffer, add the remaining space to > > + * the current buffer. 
> > + * XDP core assumes that frame_size of xdp_buff and the length > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > + */ > > + if (!headroom) > > + len += hole; > > + alloc_frag->offset += hole; > > + } > > + } > > sg_init_one(rq->sg, buf, len); > > ctx = mergeable_len_to_ctx(len + room, headroom); > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > if (err < 0) > > - put_page(virt_to_head_page(buf)); > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > return err; > > } > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > if (err < 0) > > return err; > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > - MEM_TYPE_PAGE_SHARED, NULL); > > + if (vi->rq[qp_index].page_pool) > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > + MEM_TYPE_PAGE_POOL, > > + vi->rq[qp_index].page_pool); > > + else > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > + MEM_TYPE_PAGE_SHARED, > > + NULL); > > + > > if (err < 0) > > goto err_xdp_reg_mem_model; > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > virtnet_sq_stats_desc[j].desc); > > } > > + page_pool_ethtool_stats_get_strings(p); > > break; > > } > > } > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > switch (sset) { > > case ETH_SS_STATS: > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > - VIRTNET_SQ_STATS_LEN); > > + VIRTNET_SQ_STATS_LEN + > > + (page_pool_enabled && vi->mergeable_rx_bufs ? > > + page_pool_ethtool_stats_get_count() : 0)); > > default: > > return -EOPNOTSUPP; > > } > > } > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > +{ > > +#ifdef CONFIG_PAGE_POOL_STATS > > + struct virtnet_info *vi = netdev_priv(dev); > > + struct page_pool_stats pp_stats = {}; > > + int i; > > + > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > + if (!vi->rq[i].page_pool) > > + continue; > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > + } > > + page_pool_ethtool_stats_get(data, &pp_stats); > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > +} > > + > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > struct ethtool_stats *stats, u64 *data) > > { > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > idx += VIRTNET_SQ_STATS_LEN; > > } > > + > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > } > > > > static void virtnet_get_channels(struct net_device *dev, > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > for (i = 0; i < vi->max_queue_pairs; i++) { > > __netif_napi_del(&vi->rq[i].napi); > > __netif_napi_del(&vi->sq[i].napi); > > + if (vi->rq[i].page_pool) > > + page_pool_destroy(vi->rq[i].page_pool); > > } > > > > /* We called __netif_napi_del(), > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > struct virtnet_info *vi = vq->vdev->priv; > > int i = vq2rxq(vq); > > > > - if (vi->mergeable_rx_bufs) > > - put_page(virt_to_head_page(buf)); > > - else if (vi->big_packets) > > + if (vi->mergeable_rx_bufs) { > > + if (vi->rq[i].page_pool) { > > + page_pool_put_full_page(vi->rq[i].page_pool, > > + virt_to_head_page(buf), > > + true); > > + } else { > > + put_page(virt_to_head_page(buf)); 
> > + } > > + } else if (vi->big_packets) { > > give_pages(&vi->rq[i], buf); > > Any reason only mergeable were modified but not for small and big? > > Thanks > Big mode uses the page chain to recycle pages, thus the using of "private" of the buffer page. I will take further look into that to see if it is better to use page pool in these cases. Thanks! > > - else > > + } else { > > put_page(virt_to_head_page(buf)); > > + } > > } > > > > static void free_unused_bufs(struct virtnet_info *vi) > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > virtnet_free_queues(vi); > > } > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > +{ > > + struct virtio_device *vdev = rq->vq->vdev; > > + > > + struct page_pool_params pp_params = { > > + .order = 0, > > + .pool_size = rq->vq->num_max, > > + .nid = dev_to_node(vdev->dev.parent), > > + .dev = vdev->dev.parent, > > + .offset = 0, > > + }; > > + > > + rq->page_pool = page_pool_create(&pp_params); > > + if (IS_ERR(rq->page_pool)) { > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > + PTR_ERR(rq->page_pool)); > > + rq->page_pool = NULL; > > + } > > +} > > + > > /* How large should a single buffer be so a queue full of these can fit at > > * least one full packet? > > * Logic below assumes the mergeable buffer header is used. > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > vi->sq[i].vq = vqs[txq2vq(i)]; > > + > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > + virtnet_alloc_page_pool(&vi->rq[i]); > > + else > > + dev_warn(&vi->vdev->dev, > > + "page pool only support mergeable mode\n"); > > + > > } > > > > /* run here: ret == 0. */ > > -- > > 2.31.1 > > >
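On the earlier question of whether rq can be fetched from xdp_rxq_info instead of being passed to put_xdp_frags(): the driver registers rq->xdp_rxq as the rxq of the xdp_buff, so a container_of() lookup (or an index into vi->rq[] via queue_index, as noted in the reply above) would work. A rough sketch, not taken from the series:

```c
/* Sketch only: recover the receive queue from the xdp_buff rather than
 * adding an rq parameter. Relies on rq->xdp_rxq being the xdp_rxq_info
 * registered for buffers built on this queue. An alternative would be
 * vi->rq[xdp->rxq->queue_index] where vi is available.
 */
static void put_xdp_frags(struct xdp_buff *xdp)
{
	struct receive_queue *rq =
		container_of(xdp->rxq, struct receive_queue, xdp_rxq);
	struct skb_shared_info *shinfo;
	struct page *xdp_page;
	int i;

	if (xdp_buff_has_frags(xdp)) {
		shinfo = xdp_get_shared_info_from_buff(xdp);
		for (i = 0; i < shinfo->nr_frags; i++) {
			xdp_page = skb_frag_page(&shinfo->frags[i]);
			virtnet_put_page(rq, xdp_page);
		}
	}
}
```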
On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > The implementation at the moment uses one page per packet in both the > normal and XDP path. In addition, introducing a module parameter to enable > or disable the usage of page pool (disabled by default). > > In single-core vm testing environments, it gives a modest performance gain > in the normal path. > Upstream codebase: 47.5 Gbits/sec > Upstream codebase + page_pool support: 50.2 Gbits/sec > > In multi-core vm testing environments, The most significant performance > gain is observed in XDP cpumap: > Upstream codebase: 1.38 Gbits/sec > Upstream codebase + page_pool support: 9.74 Gbits/sec > > With this foundation, we can further integrate page pool fragmentation and > DMA map/unmap support. > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> Why off by default? I am guessing it sometimes has performance costs too? What happens if we use page pool for big mode too? The less modes we have the better... > --- > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > 1 file changed, 146 insertions(+), 42 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index c5dca0d92e64..99c0ca0c1781 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > module_param(gso, bool, 0444); > module_param(napi_tx, bool, 0644); > > +static bool page_pool_enabled; > +module_param(page_pool_enabled, bool, 0400); > + > /* FIXME: MTU in config. */ > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > #define GOOD_COPY_LEN 128 > @@ -159,6 +162,9 @@ struct receive_queue { > /* Chain pages by the private ptr. */ > struct page *pages; > > + /* Page pool */ > + struct page_pool *page_pool; > + > /* Average packet length for mergeable receive buffers. 
*/ > struct ewma_pkt_len mrg_avg_pkt_len; > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > return skb; > } > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > +{ > + if (rq->page_pool) > + page_pool_put_full_page(rq->page_pool, page, true); > + else > + put_page(page); > +} > + > /* Called from bottom half context */ > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > struct receive_queue *rq, > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > hdr = skb_vnet_hdr(skb); > memcpy(hdr, hdr_p, hdr_len); > if (page_to_free) > - put_page(page_to_free); > + virtnet_put_page(rq, page_to_free); > > return skb; > } > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > return ret; > } > > -static void put_xdp_frags(struct xdp_buff *xdp) > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > { > struct skb_shared_info *shinfo; > struct page *xdp_page; > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > shinfo = xdp_get_shared_info_from_buff(xdp); > for (i = 0; i < shinfo->nr_frags; i++) { > xdp_page = skb_frag_page(&shinfo->frags[i]); > - put_page(xdp_page); > + virtnet_put_page(rq, xdp_page); > } > } > } > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > if (page_off + *len + tailroom > PAGE_SIZE) > return NULL; > > - page = alloc_page(GFP_ATOMIC); > + if (rq->page_pool) > + page = page_pool_dev_alloc_pages(rq->page_pool); > + else > + page = alloc_page(GFP_ATOMIC); > + > if (!page) > return NULL; > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > * is sending packet larger than the MTU. > */ > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > - put_page(p); > + virtnet_put_page(rq, p); > goto err_buf; > } > > memcpy(page_address(page) + page_off, > page_address(p) + off, buflen); > page_off += buflen; > - put_page(p); > + virtnet_put_page(rq, p); > } > > /* Headroom does not contribute to packet length */ > *len = page_off - VIRTIO_XDP_HEADROOM; > return page; > err_buf: > - __free_pages(page, 0); > + if (rq->page_pool) > + page_pool_put_full_page(rq->page_pool, page, true); > + else > + __free_pages(page, 0); > return NULL; > } > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > } > stats->bytes += len; > page = virt_to_head_page(buf); > - put_page(page); > + virtnet_put_page(rq, page); > } > } > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > cur_frag_size = truesize; > xdp_frags_truesz += cur_frag_size; > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > - put_page(page); > + virtnet_put_page(rq, page); > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > dev->name, len, (unsigned long)(truesize - room)); > dev->stats.rx_length_errors++; > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > return 0; > > err: > - put_xdp_frags(xdp); > + put_xdp_frags(xdp, rq); > return -EINVAL; > } > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > if (*len + xdp_room > PAGE_SIZE) > return NULL; > > - xdp_page = alloc_page(GFP_ATOMIC); > + if (rq->page_pool) > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > + else > + xdp_page = alloc_page(GFP_ATOMIC); > if (!xdp_page) > return NULL; > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct 
virtnet_info *vi, > > *frame_sz = PAGE_SIZE; > > - put_page(*page); > + virtnet_put_page(rq, *page); > > *page = xdp_page; > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > if (unlikely(!head_skb)) > break; > + if (rq->page_pool) > + skb_mark_for_recycle(head_skb); > return head_skb; > > case XDP_TX: > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > break; > } > > - put_xdp_frags(&xdp); > + put_xdp_frags(&xdp, rq); > > err_xdp: > - put_page(page); > + virtnet_put_page(rq, page); > mergeable_buf_free(rq, num_buf, dev, stats); > > stats->xdp_drops++; > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > curr_skb = head_skb; > > + if (rq->page_pool) > + skb_mark_for_recycle(curr_skb); > + > if (unlikely(!curr_skb)) > goto err_skb; > while (--num_buf) { > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > curr_skb = nskb; > head_skb->truesize += nskb->truesize; > num_skb_frags = 0; > + if (rq->page_pool) > + skb_mark_for_recycle(curr_skb); > } > if (curr_skb != head_skb) { > head_skb->data_len += len; > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > } > offset = buf - page_address(page); > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > - put_page(page); > + virtnet_put_page(rq, page); > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > len, truesize); > } else { > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > return head_skb; > > err_skb: > - put_page(page); > + virtnet_put_page(rq, page); > mergeable_buf_free(rq, num_buf, dev, stats); > > err_buf: > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > * disabled GSO for XDP, it won't be a big issue. > */ > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > - return -ENOMEM; > + if (rq->page_pool) { > + struct page *page; > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > - buf += headroom; /* advance address leaving hole at front of pkt */ > - get_page(alloc_frag->page); > - alloc_frag->offset += len + room; > - hole = alloc_frag->size - alloc_frag->offset; > - if (hole < len + room) { > - /* To avoid internal fragmentation, if there is very likely not > - * enough space for another buffer, add the remaining space to > - * the current buffer. > - * XDP core assumes that frame_size of xdp_buff and the length > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> - */ > - if (!headroom) > - len += hole; > - alloc_frag->offset += hole; > - } > + page = page_pool_dev_alloc_pages(rq->page_pool); > + if (unlikely(!page)) > + return -ENOMEM; > + buf = (char *)page_address(page); > + buf += headroom; /* advance address leaving hole at front of pkt */ > + } else { > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > + return -ENOMEM; > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > + buf += headroom; /* advance address leaving hole at front of pkt */ > + get_page(alloc_frag->page); > + alloc_frag->offset += len + room; > + hole = alloc_frag->size - alloc_frag->offset; > + if (hole < len + room) { > + /* To avoid internal fragmentation, if there is very likely not > + * enough space for another buffer, add the remaining space to > + * the current buffer. > + * XDP core assumes that frame_size of xdp_buff and the length > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > + */ > + if (!headroom) > + len += hole; > + alloc_frag->offset += hole; > + } > + } > sg_init_one(rq->sg, buf, len); > ctx = mergeable_len_to_ctx(len + room, headroom); > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > if (err < 0) > - put_page(virt_to_head_page(buf)); > + virtnet_put_page(rq, virt_to_head_page(buf)); > > return err; > } > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > if (err < 0) > return err; > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > - MEM_TYPE_PAGE_SHARED, NULL); > + if (vi->rq[qp_index].page_pool) > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > + MEM_TYPE_PAGE_POOL, > + vi->rq[qp_index].page_pool); > + else > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > + MEM_TYPE_PAGE_SHARED, > + NULL); > + > if (err < 0) > goto err_xdp_reg_mem_model; > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > virtnet_sq_stats_desc[j].desc); > } > + page_pool_ethtool_stats_get_strings(p); > break; > } > } > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > switch (sset) { > case ETH_SS_STATS: > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > - VIRTNET_SQ_STATS_LEN); > + VIRTNET_SQ_STATS_LEN + > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> + page_pool_ethtool_stats_get_count() : 0)); > default: > return -EOPNOTSUPP; > } > } > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > +{ > +#ifdef CONFIG_PAGE_POOL_STATS > + struct virtnet_info *vi = netdev_priv(dev); > + struct page_pool_stats pp_stats = {}; > + int i; > + > + for (i = 0; i < vi->curr_queue_pairs; i++) { > + if (!vi->rq[i].page_pool) > + continue; > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > + } > + page_pool_ethtool_stats_get(data, &pp_stats); > +#endif /* CONFIG_PAGE_POOL_STATS */ > +} > + > static void virtnet_get_ethtool_stats(struct net_device *dev, > struct ethtool_stats *stats, u64 *data) > { > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > idx += VIRTNET_SQ_STATS_LEN; > } > + > + virtnet_get_page_pool_stats(dev, &data[idx]); > } > > static void virtnet_get_channels(struct net_device *dev, > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > for (i = 0; i < vi->max_queue_pairs; i++) { > __netif_napi_del(&vi->rq[i].napi); > __netif_napi_del(&vi->sq[i].napi); > + if (vi->rq[i].page_pool) > + page_pool_destroy(vi->rq[i].page_pool); > } > > /* We called __netif_napi_del(), > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > struct virtnet_info *vi = vq->vdev->priv; > int i = vq2rxq(vq); > > - if (vi->mergeable_rx_bufs) > - put_page(virt_to_head_page(buf)); > - else if (vi->big_packets) > + if (vi->mergeable_rx_bufs) { > + if (vi->rq[i].page_pool) { > + page_pool_put_full_page(vi->rq[i].page_pool, > + virt_to_head_page(buf), > + true); > + } else { > + put_page(virt_to_head_page(buf)); > + } > + } else if (vi->big_packets) { > give_pages(&vi->rq[i], buf); > - else > + } else { > put_page(virt_to_head_page(buf)); > + } > } > > static void free_unused_bufs(struct virtnet_info *vi) > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > virtnet_free_queues(vi); > } > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > +{ > + struct virtio_device *vdev = rq->vq->vdev; > + > + struct page_pool_params pp_params = { > + .order = 0, > + .pool_size = rq->vq->num_max, > + .nid = dev_to_node(vdev->dev.parent), > + .dev = vdev->dev.parent, > + .offset = 0, > + }; > + > + rq->page_pool = page_pool_create(&pp_params); > + if (IS_ERR(rq->page_pool)) { > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > + PTR_ERR(rq->page_pool)); > + rq->page_pool = NULL; > + } > +} > + > /* How large should a single buffer be so a queue full of these can fit at > * least one full packet? > * Logic below assumes the mergeable buffer header is used. > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > vi->rq[i].vq = vqs[rxq2vq(i)]; > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > vi->sq[i].vq = vqs[txq2vq(i)]; > + > + if (page_pool_enabled && vi->mergeable_rx_bufs) > + virtnet_alloc_page_pool(&vi->rq[i]); > + else > + dev_warn(&vi->vdev->dev, > + "page pool only support mergeable mode\n"); > + > } > > /* run here: ret == 0. */ > -- > 2.31.1
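For context on the big-mode question raised at the top of the review above: in the current driver, big mode recycles pages itself by chaining them through page->private on each receive queue, which is what the earlier reply in this thread refers to. The helpers below are reproduced from memory of the mainline driver and may differ in detail from the current tree; they only illustrate why converting big mode to page_pool is a separate step from the mergeable path.

```c
/* Existing big-mode recycling (quoted approximately from mainline):
 * pages are kept in a per-queue list linked through page->private.
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into rq->pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* Clear private, it is used to chain pages. */
		p->private = 0;
	} else {
		p = alloc_page(gfp_mask);
	}
	return p;
}
```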
On Sat, May 27, 2023 at 12:11:25AM +0800, kernel test robot wrote: > Hi Liang, > > kernel test robot noticed the following build errors: > > [auto build test ERROR on net-next/main] > > url: https://github.com/intel-lab-lkp/linux/commits/Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805 > base: net-next/main > patch link: https://lore.kernel.org/r/20230526054621.18371-2-liangchen.linux%40gmail.com > patch subject: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance > config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20230526/202305262334.GiFQ3wpG-lkp@intel.com/config) > compiler: gcc-11 (Debian 11.3.0-12) 11.3.0 > reproduce (this is a W=1 build): > # https://github.com/intel-lab-lkp/linux/commit/bfba563f43bba37181d8502cb2e566c32f96ec9e > git remote add linux-review https://github.com/intel-lab-lkp/linux > git fetch --no-tags linux-review Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805 > git checkout bfba563f43bba37181d8502cb2e566c32f96ec9e > # save the config file > mkdir build_dir && cp config build_dir/.config > make W=1 O=build_dir ARCH=x86_64 olddefconfig > make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash > > If you fix the issue, kindly add following tag where applicable > | Reported-by: kernel test robot <lkp@intel.com> > | Closes: https://lore.kernel.org/oe-kbuild-all/202305262334.GiFQ3wpG-lkp@intel.com/ > > All errors (new ones prefixed by >>): > > ld: vmlinux.o: in function `virtnet_find_vqs': > >> virtio_net.c:(.text+0x901fb5): undefined reference to `page_pool_create' > ld: vmlinux.o: in function `add_recvbuf_mergeable.isra.0': > >> virtio_net.c:(.text+0x905618): undefined reference to `page_pool_alloc_pages' > ld: vmlinux.o: in function `xdp_linearize_page': > virtio_net.c:(.text+0x906b6b): undefined reference to `page_pool_alloc_pages' > ld: vmlinux.o: in function `mergeable_xdp_get_buf.isra.0': > virtio_net.c:(.text+0x90728f): undefined reference to `page_pool_alloc_pages' you need to tweak Kconfig to select PAGE_POOL I think. > -- > 0-DAY CI Kernel Test Service > https://github.com/intel/lkp-tests/wiki
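The undefined references reported above (page_pool_create, page_pool_alloc_pages) are the symptom of building the driver without CONFIG_PAGE_POOL, and the suggested Kconfig tweak amounts to something like the fragment below. The existing VIRTIO_NET entry is reproduced only approximately here; the added select line is the point.

```
config VIRTIO_NET
	tristate "Virtio network driver"
	depends on VIRTIO
	select NET_FAILOVER
	select PAGE_POOL   # assumed addition: pulls in page_pool_create() and friends
```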
On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote: > On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote: > > > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > The implementation at the moment uses one page per packet in both the > > > normal and XDP path. > > > > It's better to explain why we need a page pool and how it can help the > > performance. > > > > Sure, I will include that on v2. > > > In addition, introducing a module parameter to enable > > > or disable the usage of page pool (disabled by default). > > > > If page pool wins for most of the cases, any reason to disable it by default? > > > > Thank you for raising the point. It does make sense to enable it by default. I'd like to see more benchmarks pls then, with a variety of packet sizes, udp and tcp. > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > in the normal path. > > > Upstream codebase: 47.5 Gbits/sec > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > In multi-core vm testing environments, The most significant performance > > > gain is observed in XDP cpumap: > > > Upstream codebase: 1.38 Gbits/sec > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > Please show more details on the test. E.g which kinds of tests have > > you measured? > > > > Btw, it would be better to measure PPS as well. > > > > Sure. It will be added on v2. > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > DMA map/unmap support. > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > --- > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do > > the ifdef tricks at least. > > > > Sure. it will be done on v2. > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > --- a/drivers/net/virtio_net.c > > > +++ b/drivers/net/virtio_net.c > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > module_param(gso, bool, 0444); > > > module_param(napi_tx, bool, 0644); > > > > > > +static bool page_pool_enabled; > > > +module_param(page_pool_enabled, bool, 0400); > > > + > > > /* FIXME: MTU in config. */ > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > #define GOOD_COPY_LEN 128 > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > /* Chain pages by the private ptr. */ > > > struct page *pages; > > > > > > + /* Page pool */ > > > + struct page_pool *page_pool; > > > + > > > /* Average packet length for mergeable receive buffers. 
*/ > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > return skb; > > > } > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > +{ > > > + if (rq->page_pool) > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > + else > > > + put_page(page); > > > +} > > > + > > > /* Called from bottom half context */ > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > struct receive_queue *rq, > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > hdr = skb_vnet_hdr(skb); > > > memcpy(hdr, hdr_p, hdr_len); > > > if (page_to_free) > > > - put_page(page_to_free); > > > + virtnet_put_page(rq, page_to_free); > > > > > > return skb; > > > } > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > return ret; > > > } > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > { > > > > rq could be fetched from xdp_rxq_info? > > Yeah, it has the queue_index there. > > > > > struct skb_shared_info *shinfo; > > > struct page *xdp_page; > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > - put_page(xdp_page); > > > + virtnet_put_page(rq, xdp_page); > > > } > > > } > > > } > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > return NULL; > > > > > > - page = alloc_page(GFP_ATOMIC); > > > + if (rq->page_pool) > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > + else > > > + page = alloc_page(GFP_ATOMIC); > > > + > > > if (!page) > > > return NULL; > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > * is sending packet larger than the MTU. 
> > > */ > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > - put_page(p); > > > + virtnet_put_page(rq, p); > > > goto err_buf; > > > } > > > > > > memcpy(page_address(page) + page_off, > > > page_address(p) + off, buflen); > > > page_off += buflen; > > > - put_page(p); > > > + virtnet_put_page(rq, p); > > > } > > > > > > /* Headroom does not contribute to packet length */ > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > return page; > > > err_buf: > > > - __free_pages(page, 0); > > > + if (rq->page_pool) > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > + else > > > + __free_pages(page, 0); > > > return NULL; > > > } > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > } > > > stats->bytes += len; > > > page = virt_to_head_page(buf); > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > } > > > } > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > cur_frag_size = truesize; > > > xdp_frags_truesz += cur_frag_size; > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > dev->name, len, (unsigned long)(truesize - room)); > > > dev->stats.rx_length_errors++; > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > return 0; > > > > > > err: > > > - put_xdp_frags(xdp); > > > + put_xdp_frags(xdp, rq); > > > return -EINVAL; > > > } > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > if (*len + xdp_room > PAGE_SIZE) > > > return NULL; > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > + if (rq->page_pool) > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > + else > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > if (!xdp_page) > > > return NULL; > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > *frame_sz = PAGE_SIZE; > > > > > > - put_page(*page); > > > + virtnet_put_page(rq, *page); > > > > > > *page = xdp_page; > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > if (unlikely(!head_skb)) > > > break; > > > + if (rq->page_pool) > > > + skb_mark_for_recycle(head_skb); > > > return head_skb; > > > > > > case XDP_TX: > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > break; > > > } > > > > > > - put_xdp_frags(&xdp); > > > + put_xdp_frags(&xdp, rq); > > > > > > err_xdp: > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > stats->xdp_drops++; > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > curr_skb = head_skb; > > > > > > + if (rq->page_pool) > > > + skb_mark_for_recycle(curr_skb); > > > + > > > if (unlikely(!curr_skb)) > > > goto err_skb; > > > while (--num_buf) { > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > curr_skb = nskb; > > > head_skb->truesize += nskb->truesize; > > > num_skb_frags = 0; > > > + if (rq->page_pool) > > > + skb_mark_for_recycle(curr_skb); > > > } > > > if (curr_skb != head_skb) { > > > head_skb->data_len 
+= len; > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > } > > > offset = buf - page_address(page); > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > > I wonder why not we can't do this during buffer allocation like other drivers? > > > > Sorry, I don't quite understand the point here. Would you please > elaborate a bit more? > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > len, truesize); > > > } else { > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > return head_skb; > > > > > > err_skb: > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > err_buf: > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > * disabled GSO for XDP, it won't be a big issue. > > > */ > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > - return -ENOMEM; > > > + if (rq->page_pool) { > > > + struct page *page; > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > - get_page(alloc_frag->page); > > > - alloc_frag->offset += len + room; > > > - hole = alloc_frag->size - alloc_frag->offset; > > > - if (hole < len + room) { > > > - /* To avoid internal fragmentation, if there is very likely not > > > - * enough space for another buffer, add the remaining space to > > > - * the current buffer. > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > - */ > > > - if (!headroom) > > > - len += hole; > > > - alloc_frag->offset += hole; > > > - } > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > + if (unlikely(!page)) > > > + return -ENOMEM; > > > + buf = (char *)page_address(page); > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > + } else { > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > Why not simply use a helper like virtnet_page_frag_refill() and add > > the page_pool allocation logic there? It helps to reduce the > > changeset. > > > > Sure. Will do that on v2. > > > + return -ENOMEM; > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > + get_page(alloc_frag->page); > > > + alloc_frag->offset += len + room; > > > + hole = alloc_frag->size - alloc_frag->offset; > > > + if (hole < len + room) { > > > + /* To avoid internal fragmentation, if there is very likely not > > > + * enough space for another buffer, add the remaining space to > > > + * the current buffer. > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > + */ > > > + if (!headroom) > > > + len += hole; > > > + alloc_frag->offset += hole; > > > + } > > > + } > > > sg_init_one(rq->sg, buf, len); > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > if (err < 0) > > > - put_page(virt_to_head_page(buf)); > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > return err; > > > } > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > if (err < 0) > > > return err; > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > + if (vi->rq[qp_index].page_pool) > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > + MEM_TYPE_PAGE_POOL, > > > + vi->rq[qp_index].page_pool); > > > + else > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > + MEM_TYPE_PAGE_SHARED, > > > + NULL); > > > + > > > if (err < 0) > > > goto err_xdp_reg_mem_model; > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > virtnet_sq_stats_desc[j].desc); > > > } > > > + page_pool_ethtool_stats_get_strings(p); > > > break; > > > } > > > } > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > switch (sset) { > > > case ETH_SS_STATS: > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > - VIRTNET_SQ_STATS_LEN); > > > + VIRTNET_SQ_STATS_LEN + > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? > > > + page_pool_ethtool_stats_get_count() : 0)); > > > default: > > > return -EOPNOTSUPP; > > > } > > > } > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > +{ > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > + struct virtnet_info *vi = netdev_priv(dev); > > > + struct page_pool_stats pp_stats = {}; > > > + int i; > > > + > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > + if (!vi->rq[i].page_pool) > > > + continue; > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > + } > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > +} > > > + > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > struct ethtool_stats *stats, u64 *data) > > > { > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > idx += VIRTNET_SQ_STATS_LEN; > > > } > > > + > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > } > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > __netif_napi_del(&vi->rq[i].napi); > > > __netif_napi_del(&vi->sq[i].napi); > > > + if (vi->rq[i].page_pool) > > > + page_pool_destroy(vi->rq[i].page_pool); > > > } > > > > > > /* We called __netif_napi_del(), > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > struct virtnet_info *vi = vq->vdev->priv; > > > int i = vq2rxq(vq); > > > > > > - if (vi->mergeable_rx_bufs) > > > - put_page(virt_to_head_page(buf)); > > > - else if (vi->big_packets) > > > + if (vi->mergeable_rx_bufs) { > > > + if (vi->rq[i].page_pool) { > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > + virt_to_head_page(buf), > 
> > + true); > > > + } else { > > > + put_page(virt_to_head_page(buf)); > > > + } > > > + } else if (vi->big_packets) { > > > give_pages(&vi->rq[i], buf); > > > > Any reason only mergeable were modified but not for small and big? > > > > Thanks > > > > Big mode uses the page chain to recycle pages, thus the using of > "private" of the buffer page. I will take further look into that to > see if it is better to use page pool in these cases. Thanks! > > > > > > - else > > > + } else { > > > put_page(virt_to_head_page(buf)); > > > + } > > > } > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > virtnet_free_queues(vi); > > > } > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > +{ > > > + struct virtio_device *vdev = rq->vq->vdev; > > > + > > > + struct page_pool_params pp_params = { > > > + .order = 0, > > > + .pool_size = rq->vq->num_max, > > > + .nid = dev_to_node(vdev->dev.parent), > > > + .dev = vdev->dev.parent, > > > + .offset = 0, > > > + }; > > > + > > > + rq->page_pool = page_pool_create(&pp_params); > > > + if (IS_ERR(rq->page_pool)) { > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > + PTR_ERR(rq->page_pool)); > > > + rq->page_pool = NULL; > > > + } > > > +} > > > + > > > /* How large should a single buffer be so a queue full of these can fit at > > > * least one full packet? > > > * Logic below assumes the mergeable buffer header is used. > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > + > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > + else > > > + dev_warn(&vi->vdev->dev, > > > + "page pool only support mergeable mode\n"); > > > + > > > } > > > > > > /* run here: ret == 0. */ > > > -- > > > 2.31.1 > > > > >
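For context on the big-packets answer above: big mode recycles RX pages through a per-queue free list chained via page->private, so it cannot simply reuse the mergeable-mode page_pool path, where the pool owns the page lifetime. A rough paraphrase of the existing chaining helpers in virtio_net.c (illustrative, not verbatim from the driver):

/* Big mode free-list recycling, roughly as it exists today: pages are
 * chained through page->private and reused before falling back to the
 * page allocator.
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Walk to the tail of the chain being returned... */
	for (end = page; end->private; end = (struct page *)end->private)
		;
	/* ...and splice the queue's current free list onto it. */
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* Clear private; it is only used to chain free pages. */
		p->private = 0;
	} else {
		p = alloc_page(gfp_mask);
	}
	return p;
}

A page_pool-backed big mode would presumably drop this free list and let the pool do the recycling, which is the rework being evaluated above.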
On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > The implementation at the moment uses one page per packet in both the > > normal and XDP path. In addition, introducing a module parameter to enable > > or disable the usage of page pool (disabled by default). > > > > In single-core vm testing environments, it gives a modest performance gain > > in the normal path. > > Upstream codebase: 47.5 Gbits/sec > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > In multi-core vm testing environments, The most significant performance > > gain is observed in XDP cpumap: > > Upstream codebase: 1.38 Gbits/sec > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > With this foundation, we can further integrate page pool fragmentation and > > DMA map/unmap support. > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > Why off by default? > I am guessing it sometimes has performance costs too? > > > What happens if we use page pool for big mode too? > The less modes we have the better... > > Sure, now I believe it makes sense to enable it by default. When the packet size is very small, it reduces the likelihood of skb coalescing. But such cases are rare. The usage of page pool for big mode is being evaluated now. Thanks! > > --- > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index c5dca0d92e64..99c0ca0c1781 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > module_param(gso, bool, 0444); > > module_param(napi_tx, bool, 0644); > > > > +static bool page_pool_enabled; > > +module_param(page_pool_enabled, bool, 0400); > > + > > /* FIXME: MTU in config. */ > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > #define GOOD_COPY_LEN 128 > > @@ -159,6 +162,9 @@ struct receive_queue { > > /* Chain pages by the private ptr. */ > > struct page *pages; > > > > + /* Page pool */ > > + struct page_pool *page_pool; > > + > > /* Average packet length for mergeable receive buffers. 
*/ > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > return skb; > > } > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > +{ > > + if (rq->page_pool) > > + page_pool_put_full_page(rq->page_pool, page, true); > > + else > > + put_page(page); > > +} > > + > > /* Called from bottom half context */ > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > struct receive_queue *rq, > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > hdr = skb_vnet_hdr(skb); > > memcpy(hdr, hdr_p, hdr_len); > > if (page_to_free) > > - put_page(page_to_free); > > + virtnet_put_page(rq, page_to_free); > > > > return skb; > > } > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > return ret; > > } > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > { > > struct skb_shared_info *shinfo; > > struct page *xdp_page; > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > shinfo = xdp_get_shared_info_from_buff(xdp); > > for (i = 0; i < shinfo->nr_frags; i++) { > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > - put_page(xdp_page); > > + virtnet_put_page(rq, xdp_page); > > } > > } > > } > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > if (page_off + *len + tailroom > PAGE_SIZE) > > return NULL; > > > > - page = alloc_page(GFP_ATOMIC); > > + if (rq->page_pool) > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > + else > > + page = alloc_page(GFP_ATOMIC); > > + > > if (!page) > > return NULL; > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > * is sending packet larger than the MTU. 
> > */ > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > - put_page(p); > > + virtnet_put_page(rq, p); > > goto err_buf; > > } > > > > memcpy(page_address(page) + page_off, > > page_address(p) + off, buflen); > > page_off += buflen; > > - put_page(p); > > + virtnet_put_page(rq, p); > > } > > > > /* Headroom does not contribute to packet length */ > > *len = page_off - VIRTIO_XDP_HEADROOM; > > return page; > > err_buf: > > - __free_pages(page, 0); > > + if (rq->page_pool) > > + page_pool_put_full_page(rq->page_pool, page, true); > > + else > > + __free_pages(page, 0); > > return NULL; > > } > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > } > > stats->bytes += len; > > page = virt_to_head_page(buf); > > - put_page(page); > > + virtnet_put_page(rq, page); > > } > > } > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > cur_frag_size = truesize; > > xdp_frags_truesz += cur_frag_size; > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > - put_page(page); > > + virtnet_put_page(rq, page); > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > dev->name, len, (unsigned long)(truesize - room)); > > dev->stats.rx_length_errors++; > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > return 0; > > > > err: > > - put_xdp_frags(xdp); > > + put_xdp_frags(xdp, rq); > > return -EINVAL; > > } > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > if (*len + xdp_room > PAGE_SIZE) > > return NULL; > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > + if (rq->page_pool) > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > + else > > + xdp_page = alloc_page(GFP_ATOMIC); > > if (!xdp_page) > > return NULL; > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > *frame_sz = PAGE_SIZE; > > > > - put_page(*page); > > + virtnet_put_page(rq, *page); > > > > *page = xdp_page; > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > if (unlikely(!head_skb)) > > break; > > + if (rq->page_pool) > > + skb_mark_for_recycle(head_skb); > > return head_skb; > > > > case XDP_TX: > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > break; > > } > > > > - put_xdp_frags(&xdp); > > + put_xdp_frags(&xdp, rq); > > > > err_xdp: > > - put_page(page); > > + virtnet_put_page(rq, page); > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > stats->xdp_drops++; > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > curr_skb = head_skb; > > > > + if (rq->page_pool) > > + skb_mark_for_recycle(curr_skb); > > + > > if (unlikely(!curr_skb)) > > goto err_skb; > > while (--num_buf) { > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > curr_skb = nskb; > > head_skb->truesize += nskb->truesize; > > num_skb_frags = 0; > > + if (rq->page_pool) > > + skb_mark_for_recycle(curr_skb); > > } > > if (curr_skb != head_skb) { > > head_skb->data_len += len; > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > } > > offset = buf - page_address(page); > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > - 
put_page(page); > > + virtnet_put_page(rq, page); > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > len, truesize); > > } else { > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > return head_skb; > > > > err_skb: > > - put_page(page); > > + virtnet_put_page(rq, page); > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > err_buf: > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > * disabled GSO for XDP, it won't be a big issue. > > */ > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > - return -ENOMEM; > > + if (rq->page_pool) { > > + struct page *page; > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > - get_page(alloc_frag->page); > > - alloc_frag->offset += len + room; > > - hole = alloc_frag->size - alloc_frag->offset; > > - if (hole < len + room) { > > - /* To avoid internal fragmentation, if there is very likely not > > - * enough space for another buffer, add the remaining space to > > - * the current buffer. > > - * XDP core assumes that frame_size of xdp_buff and the length > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > - */ > > - if (!headroom) > > - len += hole; > > - alloc_frag->offset += hole; > > - } > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > + if (unlikely(!page)) > > + return -ENOMEM; > > + buf = (char *)page_address(page); > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > + } else { > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > + return -ENOMEM; > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > + get_page(alloc_frag->page); > > + alloc_frag->offset += len + room; > > + hole = alloc_frag->size - alloc_frag->offset; > > + if (hole < len + room) { > > + /* To avoid internal fragmentation, if there is very likely not > > + * enough space for another buffer, add the remaining space to > > + * the current buffer. > > + * XDP core assumes that frame_size of xdp_buff and the length > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > + */ > > + if (!headroom) > > + len += hole; > > + alloc_frag->offset += hole; > > + } > > + } > > sg_init_one(rq->sg, buf, len); > > ctx = mergeable_len_to_ctx(len + room, headroom); > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > if (err < 0) > > - put_page(virt_to_head_page(buf)); > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > return err; > > } > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > if (err < 0) > > return err; > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > - MEM_TYPE_PAGE_SHARED, NULL); > > + if (vi->rq[qp_index].page_pool) > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > + MEM_TYPE_PAGE_POOL, > > + vi->rq[qp_index].page_pool); > > + else > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > + MEM_TYPE_PAGE_SHARED, > > + NULL); > > + > > if (err < 0) > > goto err_xdp_reg_mem_model; > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > virtnet_sq_stats_desc[j].desc); > > } > > + page_pool_ethtool_stats_get_strings(p); > > break; > > } > > } > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > switch (sset) { > > case ETH_SS_STATS: > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > - VIRTNET_SQ_STATS_LEN); > > + VIRTNET_SQ_STATS_LEN + > > + (page_pool_enabled && vi->mergeable_rx_bufs ? > > + page_pool_ethtool_stats_get_count() : 0)); > > default: > > return -EOPNOTSUPP; > > } > > } > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > +{ > > +#ifdef CONFIG_PAGE_POOL_STATS > > + struct virtnet_info *vi = netdev_priv(dev); > > + struct page_pool_stats pp_stats = {}; > > + int i; > > + > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > + if (!vi->rq[i].page_pool) > > + continue; > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > + } > > + page_pool_ethtool_stats_get(data, &pp_stats); > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > +} > > + > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > struct ethtool_stats *stats, u64 *data) > > { > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > idx += VIRTNET_SQ_STATS_LEN; > > } > > + > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > } > > > > static void virtnet_get_channels(struct net_device *dev, > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > for (i = 0; i < vi->max_queue_pairs; i++) { > > __netif_napi_del(&vi->rq[i].napi); > > __netif_napi_del(&vi->sq[i].napi); > > + if (vi->rq[i].page_pool) > > + page_pool_destroy(vi->rq[i].page_pool); > > } > > > > /* We called __netif_napi_del(), > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > struct virtnet_info *vi = vq->vdev->priv; > > int i = vq2rxq(vq); > > > > - if (vi->mergeable_rx_bufs) > > - put_page(virt_to_head_page(buf)); > > - else if (vi->big_packets) > > + if (vi->mergeable_rx_bufs) { > > + if (vi->rq[i].page_pool) { > > + page_pool_put_full_page(vi->rq[i].page_pool, > > + virt_to_head_page(buf), > > + true); > > + } else { > > + put_page(virt_to_head_page(buf)); > > + } > > + } else if (vi->big_packets) { > > give_pages(&vi->rq[i], buf); > > - else > > + } else { > > 
put_page(virt_to_head_page(buf)); > > + } > > } > > > > static void free_unused_bufs(struct virtnet_info *vi) > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > virtnet_free_queues(vi); > > } > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > +{ > > + struct virtio_device *vdev = rq->vq->vdev; > > + > > + struct page_pool_params pp_params = { > > + .order = 0, > > + .pool_size = rq->vq->num_max, > > + .nid = dev_to_node(vdev->dev.parent), > > + .dev = vdev->dev.parent, > > + .offset = 0, > > + }; > > + > > + rq->page_pool = page_pool_create(&pp_params); > > + if (IS_ERR(rq->page_pool)) { > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > + PTR_ERR(rq->page_pool)); > > + rq->page_pool = NULL; > > + } > > +} > > + > > /* How large should a single buffer be so a queue full of these can fit at > > * least one full packet? > > * Logic below assumes the mergeable buffer header is used. > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > vi->sq[i].vq = vqs[txq2vq(i)]; > > + > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > + virtnet_alloc_page_pool(&vi->rq[i]); > > + else > > + dev_warn(&vi->vdev->dev, > > + "page pool only support mergeable mode\n"); > > + > > } > > > > /* run here: ret == 0. */ > > -- > > 2.31.1 >
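On the small-packet concern in this reply: receive_mergeable() can only fold a new buffer into the previous skb fragment when both come from the same page and are physically contiguous, which per-buffer page_pool pages never are. Roughly what that test looks like (paraphrased from skb_can_coalesce(), zerocopy checks elided):

/* Coalescing is only possible when the new fragment starts exactly where
 * the previous fragment of the same page ends; with one full page_pool
 * page per receive buffer this condition is effectively never met for
 * back-to-back small packets.
 */
static bool frag_can_coalesce(struct sk_buff *skb, int i,
			      const struct page *page, int off)
{
	if (i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];

		return page == skb_frag_page(frag) &&
		       off == skb_frag_off(frag) + skb_frag_size(frag);
	}
	return false;
}

This is why the small-packet benchmarks requested later in the thread are the interesting ones for the default-on decision.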
On Sun, May 28, 2023 at 2:27 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Sat, May 27, 2023 at 12:11:25AM +0800, kernel test robot wrote:
> > Hi Liang,
> >
> > kernel test robot noticed the following build errors:
> >
> > [auto build test ERROR on net-next/main]
> >
> > url: https://github.com/intel-lab-lkp/linux/commits/Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
> > base: net-next/main
> > patch link: https://lore.kernel.org/r/20230526054621.18371-2-liangchen.linux%40gmail.com
> > patch subject: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
> > config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20230526/202305262334.GiFQ3wpG-lkp@intel.com/config)
> > compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
> > reproduce (this is a W=1 build):
> >         # https://github.com/intel-lab-lkp/linux/commit/bfba563f43bba37181d8502cb2e566c32f96ec9e
> >         git remote add linux-review https://github.com/intel-lab-lkp/linux
> >         git fetch --no-tags linux-review Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
> >         git checkout bfba563f43bba37181d8502cb2e566c32f96ec9e
> >         # save the config file
> >         mkdir build_dir && cp config build_dir/.config
> >         make W=1 O=build_dir ARCH=x86_64 olddefconfig
> >         make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash
> >
> > If you fix the issue, kindly add following tag where applicable
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202305262334.GiFQ3wpG-lkp@intel.com/
> >
> > All errors (new ones prefixed by >>):
> >
> >    ld: vmlinux.o: in function `virtnet_find_vqs':
> > >> virtio_net.c:(.text+0x901fb5): undefined reference to `page_pool_create'
> >    ld: vmlinux.o: in function `add_recvbuf_mergeable.isra.0':
> > >> virtio_net.c:(.text+0x905618): undefined reference to `page_pool_alloc_pages'
> >    ld: vmlinux.o: in function `xdp_linearize_page':
> >    virtio_net.c:(.text+0x906b6b): undefined reference to `page_pool_alloc_pages'
> >    ld: vmlinux.o: in function `mergeable_xdp_get_buf.isra.0':
> >    virtio_net.c:(.text+0x90728f): undefined reference to `page_pool_alloc_pages'
>
> you need to tweak Kconfig to select PAGE_POOL I think.
>

Sure, thanks!

> > --
> > 0-DAY CI Kernel Test Service
> > https://github.com/intel/lkp-tests/wiki
>
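The undefined references above come from page_pool_create()/page_pool_alloc_pages() having no definitions when CONFIG_PAGE_POOL is unset. Adding "select PAGE_POOL" to the VIRTIO_NET Kconfig entry is the simple fix; the "ifdef tricks" alternative mentioned earlier would have to guard every pool call site, roughly like the sketch below (helper names are hypothetical, not part of the posted patch):

/* Hypothetical wrappers: with CONFIG_PAGE_POOL unset, no page_pool symbol
 * is referenced at all and the existing alloc_page()/put_page() paths are
 * used unchanged, so the driver still links.
 */
static struct page *virtnet_rq_alloc_page(struct receive_queue *rq, gfp_t gfp)
{
#ifdef CONFIG_PAGE_POOL
	if (rq->page_pool)
		return page_pool_dev_alloc_pages(rq->page_pool);
#endif
	return alloc_page(gfp);
}

static void virtnet_rq_put_page(struct receive_queue *rq, struct page *page)
{
#ifdef CONFIG_PAGE_POOL
	if (rq->page_pool) {
		page_pool_put_full_page(rq->page_pool, page, true);
		return;
	}
#endif
	put_page(page);
}

Selecting PAGE_POOL avoids scattering guards like these, which is why the Kconfig route is the cleaner option.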
On Sun, May 28, 2023 at 2:40 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote: > > On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > > > The implementation at the moment uses one page per packet in both the > > > > normal and XDP path. > > > > > > It's better to explain why we need a page pool and how it can help the > > > performance. > > > > > > > Sure, I will include that on v2. > > > > In addition, introducing a module parameter to enable > > > > or disable the usage of page pool (disabled by default). > > > > > > If page pool wins for most of the cases, any reason to disable it by default? > > > > > > > Thank you for raising the point. It does make sense to enable it by default. > > I'd like to see more benchmarks pls then, with a variety of packet > sizes, udp and tcp. > Sure, more benchmarks will be provided. Thanks. > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > in the normal path. > > > > Upstream codebase: 47.5 Gbits/sec > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > gain is observed in XDP cpumap: > > > > Upstream codebase: 1.38 Gbits/sec > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > Please show more details on the test. E.g which kinds of tests have > > > you measured? > > > > > > Btw, it would be better to measure PPS as well. > > > > > > > Sure. It will be added on v2. > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > DMA map/unmap support. > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > --- > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do > > > the ifdef tricks at least. > > > > > > > Sure. it will be done on v2. > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > --- a/drivers/net/virtio_net.c > > > > +++ b/drivers/net/virtio_net.c > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > module_param(gso, bool, 0444); > > > > module_param(napi_tx, bool, 0644); > > > > > > > > +static bool page_pool_enabled; > > > > +module_param(page_pool_enabled, bool, 0400); > > > > + > > > > /* FIXME: MTU in config. */ > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > #define GOOD_COPY_LEN 128 > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > /* Chain pages by the private ptr. */ > > > > struct page *pages; > > > > > > > > + /* Page pool */ > > > > + struct page_pool *page_pool; > > > > + > > > > /* Average packet length for mergeable receive buffers. 
*/ > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > return skb; > > > > } > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > +{ > > > > + if (rq->page_pool) > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > + else > > > > + put_page(page); > > > > +} > > > > + > > > > /* Called from bottom half context */ > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > struct receive_queue *rq, > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > hdr = skb_vnet_hdr(skb); > > > > memcpy(hdr, hdr_p, hdr_len); > > > > if (page_to_free) > > > > - put_page(page_to_free); > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > return skb; > > > > } > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > return ret; > > > > } > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > { > > > > > > rq could be fetched from xdp_rxq_info? > > > > Yeah, it has the queue_index there. > > > > > > > struct skb_shared_info *shinfo; > > > > struct page *xdp_page; > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > - put_page(xdp_page); > > > > + virtnet_put_page(rq, xdp_page); > > > > } > > > > } > > > > } > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > return NULL; > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > + if (rq->page_pool) > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > + else > > > > + page = alloc_page(GFP_ATOMIC); > > > > + > > > > if (!page) > > > > return NULL; > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > * is sending packet larger than the MTU. 
> > > > */ > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > - put_page(p); > > > > + virtnet_put_page(rq, p); > > > > goto err_buf; > > > > } > > > > > > > > memcpy(page_address(page) + page_off, > > > > page_address(p) + off, buflen); > > > > page_off += buflen; > > > > - put_page(p); > > > > + virtnet_put_page(rq, p); > > > > } > > > > > > > > /* Headroom does not contribute to packet length */ > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > return page; > > > > err_buf: > > > > - __free_pages(page, 0); > > > > + if (rq->page_pool) > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > + else > > > > + __free_pages(page, 0); > > > > return NULL; > > > > } > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > } > > > > stats->bytes += len; > > > > page = virt_to_head_page(buf); > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > } > > > > } > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > cur_frag_size = truesize; > > > > xdp_frags_truesz += cur_frag_size; > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > dev->stats.rx_length_errors++; > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > return 0; > > > > > > > > err: > > > > - put_xdp_frags(xdp); > > > > + put_xdp_frags(xdp, rq); > > > > return -EINVAL; > > > > } > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > if (*len + xdp_room > PAGE_SIZE) > > > > return NULL; > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > + if (rq->page_pool) > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > + else > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > if (!xdp_page) > > > > return NULL; > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > - put_page(*page); > > > > + virtnet_put_page(rq, *page); > > > > > > > > *page = xdp_page; > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > if (unlikely(!head_skb)) > > > > break; > > > > + if (rq->page_pool) > > > > + skb_mark_for_recycle(head_skb); > > > > return head_skb; > > > > > > > > case XDP_TX: > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > break; > > > > } > > > > > > > > - put_xdp_frags(&xdp); > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > err_xdp: > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > stats->xdp_drops++; > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > curr_skb = head_skb; > > > > > > > > + if (rq->page_pool) > > > > + skb_mark_for_recycle(curr_skb); > > > > + > > > > if (unlikely(!curr_skb)) > > > > goto err_skb; > > > > while (--num_buf) { > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > 
curr_skb = nskb; > > > > head_skb->truesize += nskb->truesize; > > > > num_skb_frags = 0; > > > > + if (rq->page_pool) > > > > + skb_mark_for_recycle(curr_skb); > > > > } > > > > if (curr_skb != head_skb) { > > > > head_skb->data_len += len; > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > } > > > > offset = buf - page_address(page); > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > > > I wonder why not we can't do this during buffer allocation like other drivers? > > > > > > > Sorry, I don't quite understand the point here. Would you please > > elaborate a bit more? > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > len, truesize); > > > > } else { > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > return head_skb; > > > > > > > > err_skb: > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > err_buf: > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > * disabled GSO for XDP, it won't be a big issue. > > > > */ > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > - return -ENOMEM; > > > > + if (rq->page_pool) { > > > > + struct page *page; > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > - get_page(alloc_frag->page); > > > > - alloc_frag->offset += len + room; > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > - if (hole < len + room) { > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > - * enough space for another buffer, add the remaining space to > > > > - * the current buffer. > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > - */ > > > > - if (!headroom) > > > > - len += hole; > > > > - alloc_frag->offset += hole; > > > > - } > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > + if (unlikely(!page)) > > > > + return -ENOMEM; > > > > + buf = (char *)page_address(page); > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > + } else { > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > Why not simply use a helper like virtnet_page_frag_refill() and add > > > the page_pool allocation logic there? It helps to reduce the > > > changeset. > > > > > > > Sure. Will do that on v2. > > > > + return -ENOMEM; > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > + get_page(alloc_frag->page); > > > > + alloc_frag->offset += len + room; > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > + if (hole < len + room) { > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > + * enough space for another buffer, add the remaining space to > > > > + * the current buffer. > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > + */ > > > > + if (!headroom) > > > > + len += hole; > > > > + alloc_frag->offset += hole; > > > > + } > > > > + } > > > > sg_init_one(rq->sg, buf, len); > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > if (err < 0) > > > > - put_page(virt_to_head_page(buf)); > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > return err; > > > > } > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > if (err < 0) > > > > return err; > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > + if (vi->rq[qp_index].page_pool) > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > + MEM_TYPE_PAGE_POOL, > > > > + vi->rq[qp_index].page_pool); > > > > + else > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > + MEM_TYPE_PAGE_SHARED, > > > > + NULL); > > > > + > > > > if (err < 0) > > > > goto err_xdp_reg_mem_model; > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > virtnet_sq_stats_desc[j].desc); > > > > } > > > > + page_pool_ethtool_stats_get_strings(p); > > > > break; > > > > } > > > > } > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > switch (sset) { > > > > case ETH_SS_STATS: > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > - VIRTNET_SQ_STATS_LEN); > > > > + VIRTNET_SQ_STATS_LEN + > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > default: > > > > return -EOPNOTSUPP; > > > > } > > > > } > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > +{ > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > + struct page_pool_stats pp_stats = {}; > > > > + int i; > > > > + > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > + if (!vi->rq[i].page_pool) > > > > + continue; > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > + } > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > +} > > > > + > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > struct ethtool_stats *stats, u64 *data) > > > > { > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > } > > > > + > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > } > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > __netif_napi_del(&vi->rq[i].napi); > > > > __netif_napi_del(&vi->sq[i].napi); > > > > + if (vi->rq[i].page_pool) > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > } > > > > > > > > /* We called __netif_napi_del(), > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > int i = vq2rxq(vq); > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > - 
put_page(virt_to_head_page(buf)); > > > > - else if (vi->big_packets) > > > > + if (vi->mergeable_rx_bufs) { > > > > + if (vi->rq[i].page_pool) { > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > + virt_to_head_page(buf), > > > > + true); > > > > + } else { > > > > + put_page(virt_to_head_page(buf)); > > > > + } > > > > + } else if (vi->big_packets) { > > > > give_pages(&vi->rq[i], buf); > > > > > > Any reason only mergeable were modified but not for small and big? > > > > > > Thanks > > > > > > > Big mode uses the page chain to recycle pages, thus the using of > > "private" of the buffer page. I will take further look into that to > > see if it is better to use page pool in these cases. Thanks! > > > > > > > > > > - else > > > > + } else { > > > > put_page(virt_to_head_page(buf)); > > > > + } > > > > } > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > virtnet_free_queues(vi); > > > > } > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > +{ > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > + > > > > + struct page_pool_params pp_params = { > > > > + .order = 0, > > > > + .pool_size = rq->vq->num_max, > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > + .dev = vdev->dev.parent, > > > > + .offset = 0, > > > > + }; > > > > + > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > + if (IS_ERR(rq->page_pool)) { > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > + PTR_ERR(rq->page_pool)); > > > > + rq->page_pool = NULL; > > > > + } > > > > +} > > > > + > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > * least one full packet? > > > > * Logic below assumes the mergeable buffer header is used. > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > + > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > + else > > > > + dev_warn(&vi->vdev->dev, > > > > + "page pool only support mergeable mode\n"); > > > > + > > > > } > > > > > > > > /* run here: ret == 0. */ > > > > -- > > > > 2.31.1 > > > > > > > >
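On the xdp_rxq_info point acknowledged above ("it has the queue_index there"): the xdp_buff already carries the xdp_rxq_info registered for the queue, so put_xdp_frags() could recover the receive queue itself instead of gaining an rq argument. A minimal sketch; the helper name is an assumption, not existing driver code:

/* Hypothetical helper: xdp->rxq is the xdp_rxq_info registered via
 * xdp_rxq_info_reg(), so it carries both the netdev and the RX queue
 * index needed to locate the virtio-net receive queue.
 */
static struct receive_queue *virtnet_rq_from_xdp(struct xdp_buff *xdp)
{
	struct virtnet_info *vi = netdev_priv(xdp->rxq->dev);

	return &vi->rq[xdp->rxq->queue_index];
}

put_xdp_frags() could then keep its single-argument signature.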
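Similarly, the suggestion to hide the allocation split behind a virtnet_page_frag_refill()-style helper would keep add_recvbuf_mergeable() on a single path. A rough sketch of what such a helper might look like; the name, signature and exact semantics are assumptions about the planned v2 rework, not posted code:

/* Hypothetical helper: return a buffer pointer already advanced past the
 * headroom, and the (possibly grown) buffer length, hiding whether the
 * memory came from the page pool or from the per-queue page_frag.
 */
static char *virtnet_page_frag_refill(struct receive_queue *rq,
				      struct page_frag *alloc_frag,
				      unsigned int len, unsigned int room,
				      unsigned int headroom, gfp_t gfp,
				      unsigned int *buf_len)
{
	unsigned int hole;
	char *buf;

	if (rq->page_pool) {
		struct page *page = page_pool_dev_alloc_pages(rq->page_pool);

		if (unlikely(!page))
			return NULL;
		*buf_len = len;
		return (char *)page_address(page) + headroom;
	}

	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
		return NULL;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len + room;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len + room) {
		/* Likely no room for another buffer: give it the remainder,
		 * unless XDP headroom requires the frag to stay PAGE_SIZE.
		 */
		if (!headroom)
			len += hole;
		alloc_frag->offset += hole;
	}
	*buf_len = len;
	return buf + headroom;
}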
On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > The implementation at the moment uses one page per packet in both the > > > normal and XDP path. In addition, introducing a module parameter to enable > > > or disable the usage of page pool (disabled by default). > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > in the normal path. > > > Upstream codebase: 47.5 Gbits/sec > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > In multi-core vm testing environments, The most significant performance > > > gain is observed in XDP cpumap: > > > Upstream codebase: 1.38 Gbits/sec > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > DMA map/unmap support. > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > Why off by default? > > I am guessing it sometimes has performance costs too? > > > > > > What happens if we use page pool for big mode too? > > The less modes we have the better... > > > > > > Sure, now I believe it makes sense to enable it by default. When the > packet size is very small, it reduces the likelihood of skb > coalescing. But such cases are rare. small packets are rare? These workloads are easy to create actually. Pls try and include benchmark with small packet size. > The usage of page pool for big mode is being evaluated now. Thanks! > > > > --- > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > --- a/drivers/net/virtio_net.c > > > +++ b/drivers/net/virtio_net.c > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > module_param(gso, bool, 0444); > > > module_param(napi_tx, bool, 0644); > > > > > > +static bool page_pool_enabled; > > > +module_param(page_pool_enabled, bool, 0400); > > > + > > > /* FIXME: MTU in config. */ > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > #define GOOD_COPY_LEN 128 > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > /* Chain pages by the private ptr. */ > > > struct page *pages; > > > > > > + /* Page pool */ > > > + struct page_pool *page_pool; > > > + > > > /* Average packet length for mergeable receive buffers. 
*/ > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > return skb; > > > } > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > +{ > > > + if (rq->page_pool) > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > + else > > > + put_page(page); > > > +} > > > + > > > /* Called from bottom half context */ > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > struct receive_queue *rq, > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > hdr = skb_vnet_hdr(skb); > > > memcpy(hdr, hdr_p, hdr_len); > > > if (page_to_free) > > > - put_page(page_to_free); > > > + virtnet_put_page(rq, page_to_free); > > > > > > return skb; > > > } > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > return ret; > > > } > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > { > > > struct skb_shared_info *shinfo; > > > struct page *xdp_page; > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > - put_page(xdp_page); > > > + virtnet_put_page(rq, xdp_page); > > > } > > > } > > > } > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > return NULL; > > > > > > - page = alloc_page(GFP_ATOMIC); > > > + if (rq->page_pool) > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > + else > > > + page = alloc_page(GFP_ATOMIC); > > > + > > > if (!page) > > > return NULL; > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > * is sending packet larger than the MTU. 
> > > */ > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > - put_page(p); > > > + virtnet_put_page(rq, p); > > > goto err_buf; > > > } > > > > > > memcpy(page_address(page) + page_off, > > > page_address(p) + off, buflen); > > > page_off += buflen; > > > - put_page(p); > > > + virtnet_put_page(rq, p); > > > } > > > > > > /* Headroom does not contribute to packet length */ > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > return page; > > > err_buf: > > > - __free_pages(page, 0); > > > + if (rq->page_pool) > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > + else > > > + __free_pages(page, 0); > > > return NULL; > > > } > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > } > > > stats->bytes += len; > > > page = virt_to_head_page(buf); > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > } > > > } > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > cur_frag_size = truesize; > > > xdp_frags_truesz += cur_frag_size; > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > dev->name, len, (unsigned long)(truesize - room)); > > > dev->stats.rx_length_errors++; > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > return 0; > > > > > > err: > > > - put_xdp_frags(xdp); > > > + put_xdp_frags(xdp, rq); > > > return -EINVAL; > > > } > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > if (*len + xdp_room > PAGE_SIZE) > > > return NULL; > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > + if (rq->page_pool) > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > + else > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > if (!xdp_page) > > > return NULL; > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > *frame_sz = PAGE_SIZE; > > > > > > - put_page(*page); > > > + virtnet_put_page(rq, *page); > > > > > > *page = xdp_page; > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > if (unlikely(!head_skb)) > > > break; > > > + if (rq->page_pool) > > > + skb_mark_for_recycle(head_skb); > > > return head_skb; > > > > > > case XDP_TX: > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > break; > > > } > > > > > > - put_xdp_frags(&xdp); > > > + put_xdp_frags(&xdp, rq); > > > > > > err_xdp: > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > stats->xdp_drops++; > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > curr_skb = head_skb; > > > > > > + if (rq->page_pool) > > > + skb_mark_for_recycle(curr_skb); > > > + > > > if (unlikely(!curr_skb)) > > > goto err_skb; > > > while (--num_buf) { > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > curr_skb = nskb; > > > head_skb->truesize += nskb->truesize; > > > num_skb_frags = 0; > > > + if (rq->page_pool) > > > + skb_mark_for_recycle(curr_skb); > > > } > > > if (curr_skb != head_skb) { > > > head_skb->data_len 
+= len; > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > } > > > offset = buf - page_address(page); > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > len, truesize); > > > } else { > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > return head_skb; > > > > > > err_skb: > > > - put_page(page); > > > + virtnet_put_page(rq, page); > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > err_buf: > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > * disabled GSO for XDP, it won't be a big issue. > > > */ > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > - return -ENOMEM; > > > + if (rq->page_pool) { > > > + struct page *page; > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > - get_page(alloc_frag->page); > > > - alloc_frag->offset += len + room; > > > - hole = alloc_frag->size - alloc_frag->offset; > > > - if (hole < len + room) { > > > - /* To avoid internal fragmentation, if there is very likely not > > > - * enough space for another buffer, add the remaining space to > > > - * the current buffer. > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > - */ > > > - if (!headroom) > > > - len += hole; > > > - alloc_frag->offset += hole; > > > - } > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > + if (unlikely(!page)) > > > + return -ENOMEM; > > > + buf = (char *)page_address(page); > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > + } else { > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > + return -ENOMEM; > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > + get_page(alloc_frag->page); > > > + alloc_frag->offset += len + room; > > > + hole = alloc_frag->size - alloc_frag->offset; > > > + if (hole < len + room) { > > > + /* To avoid internal fragmentation, if there is very likely not > > > + * enough space for another buffer, add the remaining space to > > > + * the current buffer. > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > + */ > > > + if (!headroom) > > > + len += hole; > > > + alloc_frag->offset += hole; > > > + } > > > + } > > > sg_init_one(rq->sg, buf, len); > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > if (err < 0) > > > - put_page(virt_to_head_page(buf)); > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > return err; > > > } > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > if (err < 0) > > > return err; > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > + if (vi->rq[qp_index].page_pool) > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > + MEM_TYPE_PAGE_POOL, > > > + vi->rq[qp_index].page_pool); > > > + else > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > + MEM_TYPE_PAGE_SHARED, > > > + NULL); > > > + > > > if (err < 0) > > > goto err_xdp_reg_mem_model; > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > virtnet_sq_stats_desc[j].desc); > > > } > > > + page_pool_ethtool_stats_get_strings(p); > > > break; > > > } > > > } > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > switch (sset) { > > > case ETH_SS_STATS: > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > - VIRTNET_SQ_STATS_LEN); > > > + VIRTNET_SQ_STATS_LEN + > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? > > > + page_pool_ethtool_stats_get_count() : 0)); > > > default: > > > return -EOPNOTSUPP; > > > } > > > } > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > +{ > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > + struct virtnet_info *vi = netdev_priv(dev); > > > + struct page_pool_stats pp_stats = {}; > > > + int i; > > > + > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > + if (!vi->rq[i].page_pool) > > > + continue; > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > + } > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > +} > > > + > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > struct ethtool_stats *stats, u64 *data) > > > { > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > idx += VIRTNET_SQ_STATS_LEN; > > > } > > > + > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > } > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > __netif_napi_del(&vi->rq[i].napi); > > > __netif_napi_del(&vi->sq[i].napi); > > > + if (vi->rq[i].page_pool) > > > + page_pool_destroy(vi->rq[i].page_pool); > > > } > > > > > > /* We called __netif_napi_del(), > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > struct virtnet_info *vi = vq->vdev->priv; > > > int i = vq2rxq(vq); > > > > > > - if (vi->mergeable_rx_bufs) > > > - put_page(virt_to_head_page(buf)); > > > - else if (vi->big_packets) > > > + if (vi->mergeable_rx_bufs) { > > > + if (vi->rq[i].page_pool) { > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > + virt_to_head_page(buf), > 
> > + true); > > > + } else { > > > + put_page(virt_to_head_page(buf)); > > > + } > > > + } else if (vi->big_packets) { > > > give_pages(&vi->rq[i], buf); > > > - else > > > + } else { > > > put_page(virt_to_head_page(buf)); > > > + } > > > } > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > virtnet_free_queues(vi); > > > } > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > +{ > > > + struct virtio_device *vdev = rq->vq->vdev; > > > + > > > + struct page_pool_params pp_params = { > > > + .order = 0, > > > + .pool_size = rq->vq->num_max, > > > + .nid = dev_to_node(vdev->dev.parent), > > > + .dev = vdev->dev.parent, > > > + .offset = 0, > > > + }; > > > + > > > + rq->page_pool = page_pool_create(&pp_params); > > > + if (IS_ERR(rq->page_pool)) { > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > + PTR_ERR(rq->page_pool)); > > > + rq->page_pool = NULL; > > > + } > > > +} > > > + > > > /* How large should a single buffer be so a queue full of these can fit at > > > * least one full packet? > > > * Logic below assumes the mergeable buffer header is used. > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > + > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > + else > > > + dev_warn(&vi->vdev->dev, > > > + "page pool only support mergeable mode\n"); > > > + > > > } > > > > > > /* run here: ret == 0. */ > > > -- > > > 2.31.1 > >
On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > The implementation at the moment uses one page per packet in both the > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > or disable the usage of page pool (disabled by default). > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > in the normal path. > > > > Upstream codebase: 47.5 Gbits/sec > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > gain is observed in XDP cpumap: > > > > Upstream codebase: 1.38 Gbits/sec > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > DMA map/unmap support. > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > Why off by default? > > > I am guessing it sometimes has performance costs too? > > > > > > > > > What happens if we use page pool for big mode too? > > > The less modes we have the better... > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > packet size is very small, it reduces the likelihood of skb > > coalescing. But such cases are rare. > > small packets are rare? These workloads are easy to create actually. > Pls try and include benchmark with small packet size. > Sure, Thanks! > > The usage of page pool for big mode is being evaluated now. Thanks! > > > > > > --- > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > --- a/drivers/net/virtio_net.c > > > > +++ b/drivers/net/virtio_net.c > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > module_param(gso, bool, 0444); > > > > module_param(napi_tx, bool, 0644); > > > > > > > > +static bool page_pool_enabled; > > > > +module_param(page_pool_enabled, bool, 0400); > > > > + > > > > /* FIXME: MTU in config. */ > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > #define GOOD_COPY_LEN 128 > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > /* Chain pages by the private ptr. */ > > > > struct page *pages; > > > > > > > > + /* Page pool */ > > > > + struct page_pool *page_pool; > > > > + > > > > /* Average packet length for mergeable receive buffers. 
*/ > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > return skb; > > > > } > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > +{ > > > > + if (rq->page_pool) > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > + else > > > > + put_page(page); > > > > +} > > > > + > > > > /* Called from bottom half context */ > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > struct receive_queue *rq, > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > hdr = skb_vnet_hdr(skb); > > > > memcpy(hdr, hdr_p, hdr_len); > > > > if (page_to_free) > > > > - put_page(page_to_free); > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > return skb; > > > > } > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > return ret; > > > > } > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > { > > > > struct skb_shared_info *shinfo; > > > > struct page *xdp_page; > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > - put_page(xdp_page); > > > > + virtnet_put_page(rq, xdp_page); > > > > } > > > > } > > > > } > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > return NULL; > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > + if (rq->page_pool) > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > + else > > > > + page = alloc_page(GFP_ATOMIC); > > > > + > > > > if (!page) > > > > return NULL; > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > * is sending packet larger than the MTU. 
> > > > */ > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > - put_page(p); > > > > + virtnet_put_page(rq, p); > > > > goto err_buf; > > > > } > > > > > > > > memcpy(page_address(page) + page_off, > > > > page_address(p) + off, buflen); > > > > page_off += buflen; > > > > - put_page(p); > > > > + virtnet_put_page(rq, p); > > > > } > > > > > > > > /* Headroom does not contribute to packet length */ > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > return page; > > > > err_buf: > > > > - __free_pages(page, 0); > > > > + if (rq->page_pool) > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > + else > > > > + __free_pages(page, 0); > > > > return NULL; > > > > } > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > } > > > > stats->bytes += len; > > > > page = virt_to_head_page(buf); > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > } > > > > } > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > cur_frag_size = truesize; > > > > xdp_frags_truesz += cur_frag_size; > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > dev->stats.rx_length_errors++; > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > return 0; > > > > > > > > err: > > > > - put_xdp_frags(xdp); > > > > + put_xdp_frags(xdp, rq); > > > > return -EINVAL; > > > > } > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > if (*len + xdp_room > PAGE_SIZE) > > > > return NULL; > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > + if (rq->page_pool) > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > + else > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > if (!xdp_page) > > > > return NULL; > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > - put_page(*page); > > > > + virtnet_put_page(rq, *page); > > > > > > > > *page = xdp_page; > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > if (unlikely(!head_skb)) > > > > break; > > > > + if (rq->page_pool) > > > > + skb_mark_for_recycle(head_skb); > > > > return head_skb; > > > > > > > > case XDP_TX: > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > break; > > > > } > > > > > > > > - put_xdp_frags(&xdp); > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > err_xdp: > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > stats->xdp_drops++; > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > curr_skb = head_skb; > > > > > > > > + if (rq->page_pool) > > > > + skb_mark_for_recycle(curr_skb); > > > > + > > > > if (unlikely(!curr_skb)) > > > > goto err_skb; > > > > while (--num_buf) { > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > 
curr_skb = nskb; > > > > head_skb->truesize += nskb->truesize; > > > > num_skb_frags = 0; > > > > + if (rq->page_pool) > > > > + skb_mark_for_recycle(curr_skb); > > > > } > > > > if (curr_skb != head_skb) { > > > > head_skb->data_len += len; > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > } > > > > offset = buf - page_address(page); > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > len, truesize); > > > > } else { > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > return head_skb; > > > > > > > > err_skb: > > > > - put_page(page); > > > > + virtnet_put_page(rq, page); > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > err_buf: > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > * disabled GSO for XDP, it won't be a big issue. > > > > */ > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > - return -ENOMEM; > > > > + if (rq->page_pool) { > > > > + struct page *page; > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > - get_page(alloc_frag->page); > > > > - alloc_frag->offset += len + room; > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > - if (hole < len + room) { > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > - * enough space for another buffer, add the remaining space to > > > > - * the current buffer. > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > - */ > > > > - if (!headroom) > > > > - len += hole; > > > > - alloc_frag->offset += hole; > > > > - } > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > + if (unlikely(!page)) > > > > + return -ENOMEM; > > > > + buf = (char *)page_address(page); > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > + } else { > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > + return -ENOMEM; > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > + get_page(alloc_frag->page); > > > > + alloc_frag->offset += len + room; > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > + if (hole < len + room) { > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > + * enough space for another buffer, add the remaining space to > > > > + * the current buffer. > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > + */ > > > > + if (!headroom) > > > > + len += hole; > > > > + alloc_frag->offset += hole; > > > > + } > > > > + } > > > > sg_init_one(rq->sg, buf, len); > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > if (err < 0) > > > > - put_page(virt_to_head_page(buf)); > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > return err; > > > > } > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > if (err < 0) > > > > return err; > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > + if (vi->rq[qp_index].page_pool) > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > + MEM_TYPE_PAGE_POOL, > > > > + vi->rq[qp_index].page_pool); > > > > + else > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > + MEM_TYPE_PAGE_SHARED, > > > > + NULL); > > > > + > > > > if (err < 0) > > > > goto err_xdp_reg_mem_model; > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > virtnet_sq_stats_desc[j].desc); > > > > } > > > > + page_pool_ethtool_stats_get_strings(p); > > > > break; > > > > } > > > > } > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > switch (sset) { > > > > case ETH_SS_STATS: > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > - VIRTNET_SQ_STATS_LEN); > > > > + VIRTNET_SQ_STATS_LEN + > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > default: > > > > return -EOPNOTSUPP; > > > > } > > > > } > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > +{ > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > + struct page_pool_stats pp_stats = {}; > > > > + int i; > > > > + > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > + if (!vi->rq[i].page_pool) > > > > + continue; > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > + } > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > +} > > > > + > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > struct ethtool_stats *stats, u64 *data) > > > > { > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > } > > > > + > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > } > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > __netif_napi_del(&vi->rq[i].napi); > > > > __netif_napi_del(&vi->sq[i].napi); > > > > + if (vi->rq[i].page_pool) > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > } > > > > > > > > /* We called __netif_napi_del(), > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > int i = vq2rxq(vq); > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > - 
put_page(virt_to_head_page(buf)); > > > > - else if (vi->big_packets) > > > > + if (vi->mergeable_rx_bufs) { > > > > + if (vi->rq[i].page_pool) { > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > + virt_to_head_page(buf), > > > > + true); > > > > + } else { > > > > + put_page(virt_to_head_page(buf)); > > > > + } > > > > + } else if (vi->big_packets) { > > > > give_pages(&vi->rq[i], buf); > > > > - else > > > > + } else { > > > > put_page(virt_to_head_page(buf)); > > > > + } > > > > } > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > virtnet_free_queues(vi); > > > > } > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > +{ > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > + > > > > + struct page_pool_params pp_params = { > > > > + .order = 0, > > > > + .pool_size = rq->vq->num_max, > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > + .dev = vdev->dev.parent, > > > > + .offset = 0, > > > > + }; > > > > + > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > + if (IS_ERR(rq->page_pool)) { > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > + PTR_ERR(rq->page_pool)); > > > > + rq->page_pool = NULL; > > > > + } > > > > +} > > > > + > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > * least one full packet? > > > > * Logic below assumes the mergeable buffer header is used. > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > + > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > + else > > > > + dev_warn(&vi->vdev->dev, > > > > + "page pool only support mergeable mode\n"); > > > > + > > > > } > > > > > > > > /* run here: ret == 0. */ > > > > -- > > > > 2.31.1 > > > >
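For context on the big-mode question raised above: outside of mergeable mode, the driver already recycles full pages itself by chaining free pages through page->private (the "Chain pages by the private ptr" field quoted earlier), which is part of why converting big mode to page pool is a separate evaluation. The existing scheme looks roughly like the following paraphrase -- a sketch of the upstream logic, not an exact copy of the driver source:

	static void give_pages(struct receive_queue *rq, struct page *page)
	{
		struct page *end;

		/* Walk to the end of the chain and splice it onto rq->pages. */
		for (end = page; end->private; end = (struct page *)end->private)
			;
		end->private = (unsigned long)rq->pages;
		rq->pages = page;
	}

	static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
	{
		struct page *p = rq->pages;

		if (p) {
			rq->pages = (struct page *)p->private;
			/* Clear private so the page no longer looks chained. */
			p->private = 0;
		} else {
			p = alloc_page(gfp_mask);
		}
		return p;
	}

A page pool would replace this hand-rolled free list, but big mode's chained multi-page buffers do not map onto order-0 pool pages as directly as the mergeable path does, which is presumably what the evaluation mentioned above has to sort out.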
On Mon, 29 May 2023 15:28:17 +0800, Liang Chen <liangchen.linux@gmail.com> wrote: > On Sun, May 28, 2023 at 2:40 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote: > > > On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > > > > > The implementation at the moment uses one page per packet in both the > > > > > normal and XDP path. > > > > > > > > It's better to explain why we need a page pool and how it can help the > > > > performance. > > > > > > > > > > Sure, I will include that on v2. > > > > > In addition, introducing a module parameter to enable > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > If page pool wins for most of the cases, any reason to disable it by default? > > > > > > > > > > Thank you for raising the point. It does make sense to enable it by default. > > > > I'd like to see more benchmarks pls then, with a variety of packet > > sizes, udp and tcp. > > > > Sure, more benchmarks will be provided. Thanks. I think so. I did this, but I did not found any improve. So I gave up it. Thanks. > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > in the normal path. > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > gain is observed in XDP cpumap: > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > Please show more details on the test. E.g which kinds of tests have > > > > you measured? > > > > > > > > Btw, it would be better to measure PPS as well. > > > > > > > > > > Sure. It will be added on v2. > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > DMA map/unmap support. > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > --- > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do > > > > the ifdef tricks at least. > > > > > > > > > > Sure. it will be done on v2. > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > --- a/drivers/net/virtio_net.c > > > > > +++ b/drivers/net/virtio_net.c > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > module_param(gso, bool, 0444); > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > +static bool page_pool_enabled; > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > + > > > > > /* FIXME: MTU in config. */ > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > #define GOOD_COPY_LEN 128 > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > /* Chain pages by the private ptr. */ > > > > > struct page *pages; > > > > > > > > > > + /* Page pool */ > > > > > + struct page_pool *page_pool; > > > > > + > > > > > /* Average packet length for mergeable receive buffers. 
*/ > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > return skb; > > > > > } > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > +{ > > > > > + if (rq->page_pool) > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > + else > > > > > + put_page(page); > > > > > +} > > > > > + > > > > > /* Called from bottom half context */ > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > struct receive_queue *rq, > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > hdr = skb_vnet_hdr(skb); > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > if (page_to_free) > > > > > - put_page(page_to_free); > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > return skb; > > > > > } > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > return ret; > > > > > } > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > { > > > > > > > > rq could be fetched from xdp_rxq_info? > > > > > > Yeah, it has the queue_index there. > > > > > > > > > struct skb_shared_info *shinfo; > > > > > struct page *xdp_page; > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > - put_page(xdp_page); > > > > > + virtnet_put_page(rq, xdp_page); > > > > > } > > > > > } > > > > > } > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > return NULL; > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > + if (rq->page_pool) > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > + else > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > + > > > > > if (!page) > > > > > return NULL; > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > * is sending packet larger than the MTU. 
> > > > > */ > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > - put_page(p); > > > > > + virtnet_put_page(rq, p); > > > > > goto err_buf; > > > > > } > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > page_address(p) + off, buflen); > > > > > page_off += buflen; > > > > > - put_page(p); > > > > > + virtnet_put_page(rq, p); > > > > > } > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > return page; > > > > > err_buf: > > > > > - __free_pages(page, 0); > > > > > + if (rq->page_pool) > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > + else > > > > > + __free_pages(page, 0); > > > > > return NULL; > > > > > } > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > } > > > > > stats->bytes += len; > > > > > page = virt_to_head_page(buf); > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > } > > > > > } > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > cur_frag_size = truesize; > > > > > xdp_frags_truesz += cur_frag_size; > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > dev->stats.rx_length_errors++; > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > return 0; > > > > > > > > > > err: > > > > > - put_xdp_frags(xdp); > > > > > + put_xdp_frags(xdp, rq); > > > > > return -EINVAL; > > > > > } > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > return NULL; > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > + if (rq->page_pool) > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > + else > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > if (!xdp_page) > > > > > return NULL; > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > - put_page(*page); > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > *page = xdp_page; > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > if (unlikely(!head_skb)) > > > > > break; > > > > > + if (rq->page_pool) > > > > > + skb_mark_for_recycle(head_skb); > > > > > return head_skb; > > > > > > > > > > case XDP_TX: > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > break; > > > > > } > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > err_xdp: > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > stats->xdp_drops++; > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > curr_skb = head_skb; > > > > > > > > > > + if (rq->page_pool) > > > > > + 
skb_mark_for_recycle(curr_skb); > > > > > + > > > > > if (unlikely(!curr_skb)) > > > > > goto err_skb; > > > > > while (--num_buf) { > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > curr_skb = nskb; > > > > > head_skb->truesize += nskb->truesize; > > > > > num_skb_frags = 0; > > > > > + if (rq->page_pool) > > > > > + skb_mark_for_recycle(curr_skb); > > > > > } > > > > > if (curr_skb != head_skb) { > > > > > head_skb->data_len += len; > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > } > > > > > offset = buf - page_address(page); > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > > > > I wonder why not we can't do this during buffer allocation like other drivers? > > > > > > > > > > Sorry, I don't quite understand the point here. Would you please > > > elaborate a bit more? > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > len, truesize); > > > > > } else { > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > return head_skb; > > > > > > > > > > err_skb: > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > err_buf: > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > * disabled GSO for XDP, it won't be a big issue. > > > > > */ > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > - return -ENOMEM; > > > > > + if (rq->page_pool) { > > > > > + struct page *page; > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > - get_page(alloc_frag->page); > > > > > - alloc_frag->offset += len + room; > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > - if (hole < len + room) { > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > - * enough space for another buffer, add the remaining space to > > > > > - * the current buffer. > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > - */ > > > > > - if (!headroom) > > > > > - len += hole; > > > > > - alloc_frag->offset += hole; > > > > > - } > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > + if (unlikely(!page)) > > > > > + return -ENOMEM; > > > > > + buf = (char *)page_address(page); > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > + } else { > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > > Why not simply use a helper like virtnet_page_frag_refill() and add > > > > the page_pool allocation logic there? It helps to reduce the > > > > changeset. > > > > > > > > > > Sure. Will do that on v2. 
> > > > > + return -ENOMEM; > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > + get_page(alloc_frag->page); > > > > > + alloc_frag->offset += len + room; > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > + if (hole < len + room) { > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > + * enough space for another buffer, add the remaining space to > > > > > + * the current buffer. > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > + */ > > > > > + if (!headroom) > > > > > + len += hole; > > > > > + alloc_frag->offset += hole; > > > > > + } > > > > > + } > > > > > sg_init_one(rq->sg, buf, len); > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > if (err < 0) > > > > > - put_page(virt_to_head_page(buf)); > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > return err; > > > > > } > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > if (err < 0) > > > > > return err; > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > + if (vi->rq[qp_index].page_pool) > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > + MEM_TYPE_PAGE_POOL, > > > > > + vi->rq[qp_index].page_pool); > > > > > + else > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > + NULL); > > > > > + > > > > > if (err < 0) > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > virtnet_sq_stats_desc[j].desc); > > > > > } > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > break; > > > > > } > > > > > } > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > switch (sset) { > > > > > case ETH_SS_STATS: > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > default: > > > > > return -EOPNOTSUPP; > > > > > } > > > > > } > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > +{ > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > + struct page_pool_stats pp_stats = {}; > > > > > + int i; > > > > > + > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > + if (!vi->rq[i].page_pool) > > > > > + continue; > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > + } > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > +} > > > > > + > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > struct ethtool_stats *stats, u64 *data) > > > > > { > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > } > > > > > + > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > } > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > + if (vi->rq[i].page_pool) > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > } > > > > > > > > > > /* We called __netif_napi_del(), > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > int i = vq2rxq(vq); > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > - put_page(virt_to_head_page(buf)); > > > > > - else if (vi->big_packets) > > > > > + if (vi->mergeable_rx_bufs) { > > > > > + if (vi->rq[i].page_pool) { > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > + virt_to_head_page(buf), > > > > > + true); > > > > > + } else { > > > > > + put_page(virt_to_head_page(buf)); > > > > > + } > > > > > + } else if (vi->big_packets) { > > > > > give_pages(&vi->rq[i], buf); > > > > > > > > Any reason only mergeable were modified but not for small and big? > > > > > > > > Thanks > > > > > > > > > > Big mode uses the page chain to recycle pages, thus the using of > > > "private" of the buffer page. I will take further look into that to > > > see if it is better to use page pool in these cases. Thanks! 
> > > > > > > > > > > > > > - else > > > > > + } else { > > > > > put_page(virt_to_head_page(buf)); > > > > > + } > > > > > } > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > virtnet_free_queues(vi); > > > > > } > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > +{ > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > + > > > > > + struct page_pool_params pp_params = { > > > > > + .order = 0, > > > > > + .pool_size = rq->vq->num_max, > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > + .dev = vdev->dev.parent, > > > > > + .offset = 0, > > > > > + }; > > > > > + > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > > + PTR_ERR(rq->page_pool)); > > > > > + rq->page_pool = NULL; > > > > > + } > > > > > +} > > > > > + > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > * least one full packet? > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > + > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > + else > > > > > + dev_warn(&vi->vdev->dev, > > > > > + "page pool only support mergeable mode\n"); > > > > > + > > > > > } > > > > > > > > > > /* run here: ret == 0. */ > > > > > -- > > > > > 2.31.1 > > > > > > > > > > >
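To make the common-refill-helper suggestion in the thread above concrete, here is a sketch of how the two allocation branches of add_recvbuf_mergeable() could be folded into one function. The name and exact shape are hypothetical (the review only proposes "a helper like virtnet_page_frag_refill()"); this is not code from the posted patch:

	/* Hypothetical helper: hand back a receive buffer either from the
	 * queue's page pool (one page per buffer, as in this patch) or from
	 * the page-frag allocator. *len may grow when the frag tail is
	 * merged into the buffer, mirroring the existing hole logic.
	 */
	static void *virtnet_rq_alloc_buf(struct receive_queue *rq,
					  struct page_frag *alloc_frag,
					  unsigned int *len, unsigned int room,
					  unsigned int headroom, gfp_t gfp)
	{
		unsigned int hole;
		char *buf;

		if (rq->page_pool) {
			struct page *page = page_pool_dev_alloc_pages(rq->page_pool);

			if (unlikely(!page))
				return NULL;
			return (char *)page_address(page) + headroom;
		}

		if (unlikely(!skb_page_frag_refill(*len + room, alloc_frag, gfp)))
			return NULL;

		buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
		get_page(alloc_frag->page);
		alloc_frag->offset += *len + room;
		hole = alloc_frag->size - alloc_frag->offset;
		if (hole < *len + room) {
			/* Not enough room left for another buffer: give the
			 * tail to this one, unless XDP headroom disables the
			 * hole mechanism.
			 */
			if (!headroom)
				*len += hole;
			alloc_frag->offset += hole;
		}
		return buf + headroom;
	}

add_recvbuf_mergeable() would then reduce to calling this helper, checking for NULL, and keeping its existing sg_init_one()/virtqueue_add_inbuf_ctx() tail -- the smaller changeset the review asks for.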
On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > The implementation at the moment uses one page per packet in both the > > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > in the normal path. > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > gain is observed in XDP cpumap: > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > DMA map/unmap support. > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > > Why off by default? > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > The less modes we have the better... > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > packet size is very small, it reduces the likelihood of skb > > > coalescing. But such cases are rare. > > > > small packets are rare? These workloads are easy to create actually. > > Pls try and include benchmark with small packet size. > > > > Sure, Thanks!

Before going ahead and posting the v2 patch, I would like to hear more advice on the case of small packets. I have done more performance benchmarking with small packets since then. Here is a list of iperf output:

With PP and PP fragmenting:
256:  [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes
1K:   [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 223 KBytes
2K:   [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 324 KBytes
4K:   [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 1.08 MBytes
8K:   [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 744 KBytes
16K:  [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes
32K:  [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes
64K:  [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes
128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes
256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes

Without PP:
256:  [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes
1K:   [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes
2K:   [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes
4K:   [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes
8K:   [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes
16K:  [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes
32K:  [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes
64K:  [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes
128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes
256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes

The major factor contributing to the performance drop is the reduction of skb coalescing.
Additionally, without the page pool, small packets can still benefit from the allocation of 8 continuous pages by breaking them down into smaller pieces. This effectively reduces the frequency of page allocation from the buddy system. For instance, the arrival of 32 1K packets only triggers one alloc_page call. Therefore, the benefits of using a page pool are limited in such cases. In fact, without page pool fragmenting enabled, it can even hinder performance from this perspective. Upon further consideration, I tend to believe making page pool the default option may not be appropriate. As you pointed out, we cannot simply ignore the performance impact on small packets. Any comments on this will be much appreciated. Thanks, Liang > > > The usage of page pool for big mode is being evaluated now. Thanks! > > > > > > > > --- > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > --- a/drivers/net/virtio_net.c > > > > > +++ b/drivers/net/virtio_net.c > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > module_param(gso, bool, 0444); > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > +static bool page_pool_enabled; > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > + > > > > > /* FIXME: MTU in config. */ > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > #define GOOD_COPY_LEN 128 > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > /* Chain pages by the private ptr. */ > > > > > struct page *pages; > > > > > > > > > > + /* Page pool */ > > > > > + struct page_pool *page_pool; > > > > > + > > > > > /* Average packet length for mergeable receive buffers. 
*/ > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > return skb; > > > > > } > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > +{ > > > > > + if (rq->page_pool) > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > + else > > > > > + put_page(page); > > > > > +} > > > > > + > > > > > /* Called from bottom half context */ > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > struct receive_queue *rq, > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > hdr = skb_vnet_hdr(skb); > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > if (page_to_free) > > > > > - put_page(page_to_free); > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > return skb; > > > > > } > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > return ret; > > > > > } > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > { > > > > > struct skb_shared_info *shinfo; > > > > > struct page *xdp_page; > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > - put_page(xdp_page); > > > > > + virtnet_put_page(rq, xdp_page); > > > > > } > > > > > } > > > > > } > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > return NULL; > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > + if (rq->page_pool) > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > + else > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > + > > > > > if (!page) > > > > > return NULL; > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > * is sending packet larger than the MTU. 
> > > > > */ > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > - put_page(p); > > > > > + virtnet_put_page(rq, p); > > > > > goto err_buf; > > > > > } > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > page_address(p) + off, buflen); > > > > > page_off += buflen; > > > > > - put_page(p); > > > > > + virtnet_put_page(rq, p); > > > > > } > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > return page; > > > > > err_buf: > > > > > - __free_pages(page, 0); > > > > > + if (rq->page_pool) > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > + else > > > > > + __free_pages(page, 0); > > > > > return NULL; > > > > > } > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > } > > > > > stats->bytes += len; > > > > > page = virt_to_head_page(buf); > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > } > > > > > } > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > cur_frag_size = truesize; > > > > > xdp_frags_truesz += cur_frag_size; > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > dev->stats.rx_length_errors++; > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > return 0; > > > > > > > > > > err: > > > > > - put_xdp_frags(xdp); > > > > > + put_xdp_frags(xdp, rq); > > > > > return -EINVAL; > > > > > } > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > return NULL; > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > + if (rq->page_pool) > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > + else > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > if (!xdp_page) > > > > > return NULL; > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > - put_page(*page); > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > *page = xdp_page; > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > if (unlikely(!head_skb)) > > > > > break; > > > > > + if (rq->page_pool) > > > > > + skb_mark_for_recycle(head_skb); > > > > > return head_skb; > > > > > > > > > > case XDP_TX: > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > break; > > > > > } > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > err_xdp: > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > stats->xdp_drops++; > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > curr_skb = head_skb; > > > > > > > > > > + if (rq->page_pool) > > > > > + 
skb_mark_for_recycle(curr_skb); > > > > > + > > > > > if (unlikely(!curr_skb)) > > > > > goto err_skb; > > > > > while (--num_buf) { > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > curr_skb = nskb; > > > > > head_skb->truesize += nskb->truesize; > > > > > num_skb_frags = 0; > > > > > + if (rq->page_pool) > > > > > + skb_mark_for_recycle(curr_skb); > > > > > } > > > > > if (curr_skb != head_skb) { > > > > > head_skb->data_len += len; > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > } > > > > > offset = buf - page_address(page); > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > len, truesize); > > > > > } else { > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > return head_skb; > > > > > > > > > > err_skb: > > > > > - put_page(page); > > > > > + virtnet_put_page(rq, page); > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > err_buf: > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > * disabled GSO for XDP, it won't be a big issue. > > > > > */ > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > - return -ENOMEM; > > > > > + if (rq->page_pool) { > > > > > + struct page *page; > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > - get_page(alloc_frag->page); > > > > > - alloc_frag->offset += len + room; > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > - if (hole < len + room) { > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > - * enough space for another buffer, add the remaining space to > > > > > - * the current buffer. > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > - */ > > > > > - if (!headroom) > > > > > - len += hole; > > > > > - alloc_frag->offset += hole; > > > > > - } > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > + if (unlikely(!page)) > > > > > + return -ENOMEM; > > > > > + buf = (char *)page_address(page); > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > + } else { > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > + return -ENOMEM; > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > + get_page(alloc_frag->page); > > > > > + alloc_frag->offset += len + room; > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > + if (hole < len + room) { > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > + * enough space for another buffer, add the remaining space to > > > > > + * the current buffer. > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > + */ > > > > > + if (!headroom) > > > > > + len += hole; > > > > > + alloc_frag->offset += hole; > > > > > + } > > > > > + } > > > > > sg_init_one(rq->sg, buf, len); > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > if (err < 0) > > > > > - put_page(virt_to_head_page(buf)); > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > return err; > > > > > } > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > if (err < 0) > > > > > return err; > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > + if (vi->rq[qp_index].page_pool) > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > + MEM_TYPE_PAGE_POOL, > > > > > + vi->rq[qp_index].page_pool); > > > > > + else > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > + NULL); > > > > > + > > > > > if (err < 0) > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > virtnet_sq_stats_desc[j].desc); > > > > > } > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > break; > > > > > } > > > > > } > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > switch (sset) { > > > > > case ETH_SS_STATS: > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > default: > > > > > return -EOPNOTSUPP; > > > > > } > > > > > } > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > +{ > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > + struct page_pool_stats pp_stats = {}; > > > > > + int i; > > > > > + > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > + if (!vi->rq[i].page_pool) > > > > > + continue; > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > + } > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > +} > > > > > + > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > struct ethtool_stats *stats, u64 *data) > > > > > { > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > } > > > > > + > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > } > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > + if (vi->rq[i].page_pool) > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > } > > > > > > > > > > /* We called __netif_napi_del(), > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > int i = vq2rxq(vq); > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > - put_page(virt_to_head_page(buf)); > > > > > - else if (vi->big_packets) > > > > > + if (vi->mergeable_rx_bufs) { > > > > > + if (vi->rq[i].page_pool) { > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > + virt_to_head_page(buf), > > > > > + true); > > > > > + } else { > > > > > + put_page(virt_to_head_page(buf)); > > > > > + } > > > > > + } else if (vi->big_packets) { > > > > > give_pages(&vi->rq[i], buf); > > > > > - else > > > > > + } else { > > > > > put_page(virt_to_head_page(buf)); > > > > > + } > > > > > } > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > virtnet_free_queues(vi); > > > > > } > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > +{ > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > + > > > > > + struct page_pool_params pp_params = { > > > > > + .order = 0, > > > > > + .pool_size = rq->vq->num_max, > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > + .dev = vdev->dev.parent, > > > > > + .offset = 0, > > > > > + }; > > > > > + > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > > + PTR_ERR(rq->page_pool)); > > > > > + rq->page_pool = NULL; > > > > > + } > > > > > +} > > > > > + > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > * least one full packet? > > > > > * Logic below assumes the mergeable buffer header is used. 
> > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > + > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > + else > > > > > + dev_warn(&vi->vdev->dev, > > > > > + "page pool only support mergeable mode\n"); > > > > > + > > > > > } > > > > > > > > > > /* run here: ret == 0. */ > > > > > -- > > > > > 2.31.1 > > > > > >
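To make the CONFIG_PAGE_POOL point above concrete, a minimal sketch of the "ifdef trick" variant (assuming v2 does not simply add "select PAGE_POOL" to the VIRTIO_NET Kconfig entry) would compile the pool branch out of the helper introduced by this patch; the page_pool member of struct receive_queue would need the same guard, and this is only an illustration, not the actual follow-up change:

static void virtnet_put_page(struct receive_queue *rq, struct page *page)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
	/* Return page pool pages to the per-queue pool for reuse. */
	if (rq->page_pool) {
		page_pool_put_full_page(rq->page_pool, page, true);
		return;
	}
#endif
	put_page(page);
}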
On Wed, May 31, 2023 at 11:12 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > On Mon, 29 May 2023 15:28:17 +0800, Liang Chen <liangchen.linux@gmail.com> wrote: > > On Sun, May 28, 2023 at 2:40 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote: > > > > On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > normal and XDP path. > > > > > > > > > > It's better to explain why we need a page pool and how it can help the > > > > > performance. > > > > > > > > > > > > > Sure, I will include that on v2. > > > > > > In addition, introducing a module parameter to enable > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > If page pool wins for most of the cases, any reason to disable it by default? > > > > > > > > > > > > > Thank you for raising the point. It does make sense to enable it by default. > > > > > > I'd like to see more benchmarks pls then, with a variety of packet > > > sizes, udp and tcp. > > > > > > > Sure, more benchmarks will be provided. Thanks. > > > I think so. > > I did this, but I did not found any improve. So I gave up it. > > Thanks. > > Our UDP benchmark shows a steady 0.8 percent change in PPS measurement. However, when conducting iperf TCP stream performance testing, the results vary depending on the packet size and testing setup. With small packet sizes, the performance actually drops slightly due to the reasons I explained in the previous email. On the other hand, with large packets, we need to ensure that the sender side doesn't become the bottleneck. To achieve this, our setup uses a single-core vm to keep the receiver busy, which allows us to identify performance differences in the receiving path. Thanks, Liang > > > > > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > in the normal path. > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > gain is observed in XDP cpumap: > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > Please show more details on the test. E.g which kinds of tests have > > > > > you measured? > > > > > > > > > > Btw, it would be better to measure PPS as well. > > > > > > > > > > > > > Sure. It will be added on v2. > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > DMA map/unmap support. > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > --- > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > > > > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do > > > > > the ifdef tricks at least. > > > > > > > > > > > > > Sure. it will be done on v2. 
> > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > --- a/drivers/net/virtio_net.c > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > module_param(gso, bool, 0444); > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > + > > > > > > /* FIXME: MTU in config. */ > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > #define GOOD_COPY_LEN 128 > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > /* Chain pages by the private ptr. */ > > > > > > struct page *pages; > > > > > > > > > > > > + /* Page pool */ > > > > > > + struct page_pool *page_pool; > > > > > > + > > > > > > /* Average packet length for mergeable receive buffers. */ > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > return skb; > > > > > > } > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > +{ > > > > > > + if (rq->page_pool) > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > + else > > > > > > + put_page(page); > > > > > > +} > > > > > > + > > > > > > /* Called from bottom half context */ > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > struct receive_queue *rq, > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > if (page_to_free) > > > > > > - put_page(page_to_free); > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > return skb; > > > > > > } > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > return ret; > > > > > > } > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > { > > > > > > > > > > rq could be fetched from xdp_rxq_info? > > > > > > > > Yeah, it has the queue_index there. > > > > > > > > > > > struct skb_shared_info *shinfo; > > > > > > struct page *xdp_page; > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > - put_page(xdp_page); > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > } > > > > > > } > > > > > > } > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > return NULL; > > > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > > + if (rq->page_pool) > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > + else > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > + > > > > > > if (!page) > > > > > > return NULL; > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > * is sending packet larger than the MTU. 
> > > > > > */ > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > - put_page(p); > > > > > > + virtnet_put_page(rq, p); > > > > > > goto err_buf; > > > > > > } > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > page_address(p) + off, buflen); > > > > > > page_off += buflen; > > > > > > - put_page(p); > > > > > > + virtnet_put_page(rq, p); > > > > > > } > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > return page; > > > > > > err_buf: > > > > > > - __free_pages(page, 0); > > > > > > + if (rq->page_pool) > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > + else > > > > > > + __free_pages(page, 0); > > > > > > return NULL; > > > > > > } > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > } > > > > > > stats->bytes += len; > > > > > > page = virt_to_head_page(buf); > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > } > > > > > > } > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > cur_frag_size = truesize; > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > dev->stats.rx_length_errors++; > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > return 0; > > > > > > > > > > > > err: > > > > > > - put_xdp_frags(xdp); > > > > > > + put_xdp_frags(xdp, rq); > > > > > > return -EINVAL; > > > > > > } > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > return NULL; > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > + if (rq->page_pool) > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > + else > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > > if (!xdp_page) > > > > > > return NULL; > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > - put_page(*page); > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > > > *page = xdp_page; > > > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > if (unlikely(!head_skb)) > > > > > > break; > > > > > > + if (rq->page_pool) > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > return head_skb; > > > > > > > > > > > > case XDP_TX: > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > break; > > > > > > } > > > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > err_xdp: > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > stats->xdp_drops++; > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev, > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > curr_skb = head_skb; > > > > > > > > > > > > + if (rq->page_pool) > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > + > > > > > > if (unlikely(!curr_skb)) > > > > > > goto err_skb; > > > > > > while (--num_buf) { > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > curr_skb = nskb; > > > > > > head_skb->truesize += nskb->truesize; > > > > > > num_skb_frags = 0; > > > > > > + if (rq->page_pool) > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > } > > > > > > if (curr_skb != head_skb) { > > > > > > head_skb->data_len += len; > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > } > > > > > > offset = buf - page_address(page); > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > > > > > I wonder why not we can't do this during buffer allocation like other drivers? > > > > > > > > > > > > > Sorry, I don't quite understand the point here. Would you please > > > > elaborate a bit more? > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > len, truesize); > > > > > > } else { > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > return head_skb; > > > > > > > > > > > > err_skb: > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > err_buf: > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > * disabled GSO for XDP, it won't be a big issue. > > > > > > */ > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > - return -ENOMEM; > > > > > > + if (rq->page_pool) { > > > > > > + struct page *page; > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > - get_page(alloc_frag->page); > > > > > > - alloc_frag->offset += len + room; > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > - if (hole < len + room) { > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > - * the current buffer. > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > - */ > > > > > > - if (!headroom) > > > > > > - len += hole; > > > > > > - alloc_frag->offset += hole; > > > > > > - } > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > + if (unlikely(!page)) > > > > > > + return -ENOMEM; > > > > > > + buf = (char *)page_address(page); > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > + } else { > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > > > > Why not simply use a helper like virtnet_page_frag_refill() and add > > > > > the page_pool allocation logic there? It helps to reduce the > > > > > changeset. > > > > > > > > > > > > > Sure. Will do that on v2. 
> > > > > > + return -ENOMEM; > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > + get_page(alloc_frag->page); > > > > > > + alloc_frag->offset += len + room; > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > + if (hole < len + room) { > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > + * the current buffer. > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > + */ > > > > > > + if (!headroom) > > > > > > + len += hole; > > > > > > + alloc_frag->offset += hole; > > > > > > + } > > > > > > + } > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > if (err < 0) > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > return err; > > > > > > } > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > if (err < 0) > > > > > > return err; > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > + vi->rq[qp_index].page_pool); > > > > > > + else > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > + NULL); > > > > > > + > > > > > > if (err < 0) > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > } > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > break; > > > > > > } > > > > > > } > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > switch (sset) { > > > > > > case ETH_SS_STATS: > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > default: > > > > > > return -EOPNOTSUPP; > > > > > > } > > > > > > } > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > +{ > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > + int i; > > > > > > + > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > + if (!vi->rq[i].page_pool) > > > > > > + continue; > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > + } > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > +} > > > > > > + > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > { > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > } > > > > > > + > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > } > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > + if (vi->rq[i].page_pool) > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > } > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > - else if (vi->big_packets) > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > + if (vi->rq[i].page_pool) { > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > + virt_to_head_page(buf), > > > > > > + true); > > > > > > + } else { > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > + } > > > > > > + } else if (vi->big_packets) { > > > > > > give_pages(&vi->rq[i], buf); > > > > > > > > > > Any reason only mergeable were modified but not for small and big? > > > > > > > > > > Thanks > > > > > > > > > > > > > Big mode uses the page chain to recycle pages, thus the using of > > > > "private" of the buffer page. I will take further look into that to > > > > see if it is better to use page pool in these cases. Thanks! 
> > > > > > > > > > > > > > > > > > - else > > > > > > + } else { > > > > > > put_page(virt_to_head_page(buf)); > > > > > > + } > > > > > > } > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > virtnet_free_queues(vi); > > > > > > } > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > +{ > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > + > > > > > > + struct page_pool_params pp_params = { > > > > > > + .order = 0, > > > > > > + .pool_size = rq->vq->num_max, > > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > > + .dev = vdev->dev.parent, > > > > > > + .offset = 0, > > > > > > + }; > > > > > > + > > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > + rq->page_pool = NULL; > > > > > > + } > > > > > > +} > > > > > > + > > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > > * least one full packet? > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > + > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > + else > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > + "page pool only support mergeable mode\n"); > > > > > > + > > > > > > } > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > -- > > > > > > 2.31.1 > > > > > > > > > > > > > >
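On the xdp_rxq_info point above: since each receive queue registers its own xdp_rxq with its queue index, put_xdp_frags() could recover the queue from the xdp_buff instead of growing its argument list. A minimal sketch, with a made-up helper name that is not part of the series:

static struct receive_queue *virtnet_rq_from_xdp(struct xdp_buff *xdp)
{
	struct virtnet_info *vi = netdev_priv(xdp->rxq->dev);

	/* xdp->rxq was registered as vi->rq[i].xdp_rxq, so queue_index == i */
	return &vi->rq[xdp->rxq->queue_index];
}

put_xdp_frags() could then keep its original single-argument signature and call virtnet_put_page(virtnet_rq_from_xdp(xdp), xdp_page) on each frag.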
On Wed, 7 Jun 2023 17:11:44 +0800, Liang Chen <liangchen.linux@gmail.com> wrote: > On Wed, May 31, 2023 at 11:12 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote: > > > > On Mon, 29 May 2023 15:28:17 +0800, Liang Chen <liangchen.linux@gmail.com> wrote: > > > On Sun, May 28, 2023 at 2:40 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote: > > > > > On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > > > > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > > > > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > > normal and XDP path. > > > > > > > > > > > > It's better to explain why we need a page pool and how it can help the > > > > > > performance. > > > > > > > > > > > > > > > > Sure, I will include that on v2. > > > > > > > In addition, introducing a module parameter to enable > > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > If page pool wins for most of the cases, any reason to disable it by default? > > > > > > > > > > > > > > > > Thank you for raising the point. It does make sense to enable it by default. > > > > > > > > I'd like to see more benchmarks pls then, with a variety of packet > > > > sizes, udp and tcp. > > > > > > > > > > Sure, more benchmarks will be provided. Thanks. > > > > > > I think so. > > > > I did this, but I did not found any improve. So I gave up it. > > > > Thanks. > > > > > > Our UDP benchmark shows a steady 0.8 percent change in PPS > measurement. However, when conducting iperf TCP stream performance > testing, the results vary depending on the packet size and testing > setup. With small packet sizes, the performance actually drops > slightly due to the reasons I explained in the previous email. On the > other hand, with large packets, we need to ensure that the sender side > doesn't become the bottleneck. To achieve this, our setup uses a > single-core vm to keep the receiver busy, which allows us to identify > performance differences in the receiving path. Could you show some numbers? Thanks. > > > Thanks, > Liang > > > > > > > > > > > > > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > > in the normal path. > > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > > gain is observed in XDP cpumap: > > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > Please show more details on the test. E.g which kinds of tests have > > > > > > you measured? > > > > > > > > > > > > Btw, it would be better to measure PPS as well. > > > > > > > > > > > > > > > > Sure. It will be added on v2. > > > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > > DMA map/unmap support. > > > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > --- > > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > > > > > > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do > > > > > > the ifdef tricks at least. > > > > > > > > > > > > > > > > Sure. it will be done on v2. 
> > > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > > --- a/drivers/net/virtio_net.c > > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > > module_param(gso, bool, 0444); > > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > > + > > > > > > > /* FIXME: MTU in config. */ > > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > > #define GOOD_COPY_LEN 128 > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > > /* Chain pages by the private ptr. */ > > > > > > > struct page *pages; > > > > > > > > > > > > > > + /* Page pool */ > > > > > > > + struct page_pool *page_pool; > > > > > > > + > > > > > > > /* Average packet length for mergeable receive buffers. */ > > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > > return skb; > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > > +{ > > > > > > > + if (rq->page_pool) > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > + else > > > > > > > + put_page(page); > > > > > > > +} > > > > > > > + > > > > > > > /* Called from bottom half context */ > > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > struct receive_queue *rq, > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > > if (page_to_free) > > > > > > > - put_page(page_to_free); > > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > > > return skb; > > > > > > > } > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > > return ret; > > > > > > > } > > > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > > { > > > > > > > > > > > > rq could be fetched from xdp_rxq_info? > > > > > > > > > > Yeah, it has the queue_index there. 
> > > > > > > > > > > > > struct skb_shared_info *shinfo; > > > > > > > struct page *xdp_page; > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > > - put_page(xdp_page); > > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > > } > > > > > > > } > > > > > > > } > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > > return NULL; > > > > > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > > > + if (rq->page_pool) > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + else > > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > > + > > > > > > > if (!page) > > > > > > > return NULL; > > > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > * is sending packet larger than the MTU. > > > > > > > */ > > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > > - put_page(p); > > > > > > > + virtnet_put_page(rq, p); > > > > > > > goto err_buf; > > > > > > > } > > > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > > page_address(p) + off, buflen); > > > > > > > page_off += buflen; > > > > > > > - put_page(p); > > > > > > > + virtnet_put_page(rq, p); > > > > > > > } > > > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > > return page; > > > > > > > err_buf: > > > > > > > - __free_pages(page, 0); > > > > > > > + if (rq->page_pool) > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > + else > > > > > > > + __free_pages(page, 0); > > > > > > > return NULL; > > > > > > > } > > > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > > } > > > > > > > stats->bytes += len; > > > > > > > page = virt_to_head_page(buf); > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > } > > > > > > > } > > > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > cur_frag_size = truesize; > > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > > dev->stats.rx_length_errors++; > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > return 0; > > > > > > > > > > > > > > err: > > > > > > > - put_xdp_frags(xdp); > > > > > > > + put_xdp_frags(xdp, rq); > > > > > > > return -EINVAL; > > > > > > > } > > > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > > return NULL; > > > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > + if (rq->page_pool) > > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + else > > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); 
> > > > > > > if (!xdp_page) > > > > > > > return NULL; > > > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > > > - put_page(*page); > > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > > > > > *page = xdp_page; > > > > > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > > if (unlikely(!head_skb)) > > > > > > > break; > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > > return head_skb; > > > > > > > > > > > > > > case XDP_TX: > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > break; > > > > > > > } > > > > > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > > > err_xdp: > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > stats->xdp_drops++; > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > > curr_skb = head_skb; > > > > > > > > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > + > > > > > > > if (unlikely(!curr_skb)) > > > > > > > goto err_skb; > > > > > > > while (--num_buf) { > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > curr_skb = nskb; > > > > > > > head_skb->truesize += nskb->truesize; > > > > > > > num_skb_frags = 0; > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > } > > > > > > > if (curr_skb != head_skb) { > > > > > > > head_skb->data_len += len; > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > } > > > > > > > offset = buf - page_address(page); > > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > > > > > I wonder why not we can't do this during buffer allocation like other drivers? > > > > > > > > > > > > > > > > Sorry, I don't quite understand the point here. Would you please > > > > > elaborate a bit more? > > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > > len, truesize); > > > > > > > } else { > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > return head_skb; > > > > > > > > > > > > > > err_skb: > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > err_buf: > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > > * disabled GSO for XDP, it won't be a big issue. 
> > > > > > > */ > > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > - return -ENOMEM; > > > > > > > + if (rq->page_pool) { > > > > > > > + struct page *page; > > > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > - get_page(alloc_frag->page); > > > > > > > - alloc_frag->offset += len + room; > > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > > - if (hole < len + room) { > > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > > - * the current buffer. > > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > > - */ > > > > > > > - if (!headroom) > > > > > > > - len += hole; > > > > > > > - alloc_frag->offset += hole; > > > > > > > - } > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + if (unlikely(!page)) > > > > > > > + return -ENOMEM; > > > > > > > + buf = (char *)page_address(page); > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > + } else { > > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > > > > > > Why not simply use a helper like virtnet_page_frag_refill() and add > > > > > > the page_pool allocation logic there? It helps to reduce the > > > > > > changeset. > > > > > > > > > > > > > > > > Sure. Will do that on v2. > > > > > > > + return -ENOMEM; > > > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > + get_page(alloc_frag->page); > > > > > > > + alloc_frag->offset += len + room; > > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > > + if (hole < len + room) { > > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > > + * the current buffer. > > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > > > + */ > > > > > > > + if (!headroom) > > > > > > > + len += hole; > > > > > > > + alloc_frag->offset += hole; > > > > > > > + } > > > > > > > + } > > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > > if (err < 0) > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > > > return err; > > > > > > > } > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > > if (err < 0) > > > > > > > return err; > > > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > > + vi->rq[qp_index].page_pool); > > > > > > > + else > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > > + NULL); > > > > > > > + > > > > > > > if (err < 0) > > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > > } > > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > > break; > > > > > > > } > > > > > > > } > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > > switch (sset) { > > > > > > > case ETH_SS_STATS: > > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > > default: > > > > > > > return -EOPNOTSUPP; > > > > > > > } > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > > +{ > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > > + int i; > > > > > > > + > > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > > + if (!vi->rq[i].page_pool) > > > > > > > + continue; > > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > > + } > > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > > +} > > > > > > > + > > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > > { > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > > } > > > > > > > + > > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > > } > > > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > > + if (vi->rq[i].page_pool) > > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > > } > > > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > - else if (vi->big_packets) > > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > > + if (vi->rq[i].page_pool) { > > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > > + virt_to_head_page(buf), > > > > > > > + true); > > > > > > > + } else { > > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > > + } > > > > > > > + } else if (vi->big_packets) { > > > > > > > give_pages(&vi->rq[i], buf); > > > > > > > > > > > > Any reason only mergeable were modified but not for small and big? > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > > > Big mode uses the page chain to recycle pages, thus the using of > > > > > "private" of the buffer page. I will take further look into that to > > > > > see if it is better to use page pool in these cases. Thanks! 
> > > > > > > > > > > > > > > > > > > > > > - else > > > > > > > + } else { > > > > > > > put_page(virt_to_head_page(buf)); > > > > > > > + } > > > > > > > } > > > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > > virtnet_free_queues(vi); > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > > +{ > > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > > + > > > > > > > + struct page_pool_params pp_params = { > > > > > > > + .order = 0, > > > > > > > + .pool_size = rq->vq->num_max, > > > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > > > + .dev = vdev->dev.parent, > > > > > > > + .offset = 0, > > > > > > > + }; > > > > > > > + > > > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > > + rq->page_pool = NULL; > > > > > > > + } > > > > > > > +} > > > > > > > + > > > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > > > * least one full packet? > > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > > + > > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > > + else > > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > > + "page pool only support mergeable mode\n"); > > > > > > > + > > > > > > > } > > > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > > -- > > > > > > > 2.31.1 > > > > > > > > > > > > > > > > >
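For the virtnet_page_frag_refill()-style consolidation suggested above, one possible shape is sketched below; the helper name, its signature, and the choice to leave the headroom advance and the hole adjustment in add_recvbuf_mergeable() are assumptions rather than the actual v2 code:

static void *virtnet_rq_alloc_buf(struct receive_queue *rq,
				  struct page_frag *alloc_frag,
				  unsigned int len, unsigned int room,
				  gfp_t gfp)
{
	char *buf;

	if (rq->page_pool) {
		/* Page pool path: one full page per buffer. */
		struct page *page = page_pool_dev_alloc_pages(rq->page_pool);

		if (unlikely(!page))
			return NULL;
		return page_address(page);
	}

	/* Legacy path: carve the buffer out of the per-queue page frag. */
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
		return NULL;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len + room;
	return buf;
}

add_recvbuf_mergeable() would then make a single call here and only keep the hole handling for the page_frag case, shrinking the if/else that the current patch adds.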
On Wed, 7 Jun 2023 17:08:59 +0800, Liang Chen <liangchen.linux@gmail.com> wrote: > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > in the normal path. > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > gain is observed in XDP cpumap: > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > DMA map/unmap support. > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > > > > Why off by default? > > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > > The less modes we have the better... > > > > > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > > packet size is very small, it reduces the likelihood of skb > > > > coalescing. But such cases are rare. > > > > > > small packets are rare? These workloads are easy to create actually. > > > Pls try and include benchmark with small packet size. > > > > > > > Sure, Thanks! > > Before going ahead and posting v2 patch, I would like to hear more > advice for the cases of small packets. I have done more performance > benchmark with small packets since then. Here is a list of iperf > output, Could you show the command line? 
Thanks

> With PP and PP fragmenting:
> 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes
> 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 223 KBytes
> 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 324 KBytes
> 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 1.08 MBytes
> 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 744 KBytes
> 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes
> 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes
> 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes
> 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes
> 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes
>
> Without PP:
> 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes
> 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes
> 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes
> 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes
> 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes
> 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes
> 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes
> 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes
> 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes
> 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes
>
> The major factor contributing to the performance drop is the reduction > of skb coalescing. Additionally, without the page pool, small packets > can still benefit from the allocation of 8 contiguous pages by > breaking them down into smaller pieces. This effectively reduces the > frequency of page allocation from the buddy system. For instance, the > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > the benefits of using a page pool are limited in such cases. In fact, > without page pool fragmenting enabled, it can even hinder performance > from this perspective. > > Upon further consideration, I tend to believe making page pool the > default option may not be appropriate. As you pointed out, we cannot > simply ignore the performance impact on small packets. Any comments on > this will be much appreciated. > > > Thanks, > Liang > > > > > > The usage of page pool for big mode is being evaluated now. Thanks! > > > > > > > > > --- > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > --- a/drivers/net/virtio_net.c > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > module_param(gso, bool, 0444); > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > + > > > > > > /* FIXME: MTU in config. */ > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > #define GOOD_COPY_LEN 128 > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > /* Chain pages by the private ptr. */ > > > > > > struct page *pages; > > > > > > > > > > > > + /* Page pool */ > > > > > > + struct page_pool *page_pool; > > > > > > + > > > > > > /* Average packet length for mergeable receive buffers. 
*/ > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > return skb; > > > > > > } > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > +{ > > > > > > + if (rq->page_pool) > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > + else > > > > > > + put_page(page); > > > > > > +} > > > > > > + > > > > > > /* Called from bottom half context */ > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > struct receive_queue *rq, > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > if (page_to_free) > > > > > > - put_page(page_to_free); > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > return skb; > > > > > > } > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > return ret; > > > > > > } > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > { > > > > > > struct skb_shared_info *shinfo; > > > > > > struct page *xdp_page; > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > - put_page(xdp_page); > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > } > > > > > > } > > > > > > } > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > return NULL; > > > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > > + if (rq->page_pool) > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > + else > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > + > > > > > > if (!page) > > > > > > return NULL; > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > * is sending packet larger than the MTU. 
> > > > > > */ > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > - put_page(p); > > > > > > + virtnet_put_page(rq, p); > > > > > > goto err_buf; > > > > > > } > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > page_address(p) + off, buflen); > > > > > > page_off += buflen; > > > > > > - put_page(p); > > > > > > + virtnet_put_page(rq, p); > > > > > > } > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > return page; > > > > > > err_buf: > > > > > > - __free_pages(page, 0); > > > > > > + if (rq->page_pool) > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > + else > > > > > > + __free_pages(page, 0); > > > > > > return NULL; > > > > > > } > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > } > > > > > > stats->bytes += len; > > > > > > page = virt_to_head_page(buf); > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > } > > > > > > } > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > cur_frag_size = truesize; > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > dev->stats.rx_length_errors++; > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > return 0; > > > > > > > > > > > > err: > > > > > > - put_xdp_frags(xdp); > > > > > > + put_xdp_frags(xdp, rq); > > > > > > return -EINVAL; > > > > > > } > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > return NULL; > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > + if (rq->page_pool) > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > + else > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > > if (!xdp_page) > > > > > > return NULL; > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > - put_page(*page); > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > > > *page = xdp_page; > > > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > if (unlikely(!head_skb)) > > > > > > break; > > > > > > + if (rq->page_pool) > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > return head_skb; > > > > > > > > > > > > case XDP_TX: > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > break; > > > > > > } > > > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > err_xdp: > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > stats->xdp_drops++; > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev, > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > curr_skb = head_skb; > > > > > > > > > > > > + if (rq->page_pool) > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > + > > > > > > if (unlikely(!curr_skb)) > > > > > > goto err_skb; > > > > > > while (--num_buf) { > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > curr_skb = nskb; > > > > > > head_skb->truesize += nskb->truesize; > > > > > > num_skb_frags = 0; > > > > > > + if (rq->page_pool) > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > } > > > > > > if (curr_skb != head_skb) { > > > > > > head_skb->data_len += len; > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > } > > > > > > offset = buf - page_address(page); > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > len, truesize); > > > > > > } else { > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > return head_skb; > > > > > > > > > > > > err_skb: > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > err_buf: > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > * disabled GSO for XDP, it won't be a big issue. > > > > > > */ > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > - return -ENOMEM; > > > > > > + if (rq->page_pool) { > > > > > > + struct page *page; > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > - get_page(alloc_frag->page); > > > > > > - alloc_frag->offset += len + room; > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > - if (hole < len + room) { > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > - * the current buffer. > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > > - */ > > > > > > - if (!headroom) > > > > > > - len += hole; > > > > > > - alloc_frag->offset += hole; > > > > > > - } > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > + if (unlikely(!page)) > > > > > > + return -ENOMEM; > > > > > > + buf = (char *)page_address(page); > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > + } else { > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > + return -ENOMEM; > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > + get_page(alloc_frag->page); > > > > > > + alloc_frag->offset += len + room; > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > + if (hole < len + room) { > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > + * the current buffer. > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > + */ > > > > > > + if (!headroom) > > > > > > + len += hole; > > > > > > + alloc_frag->offset += hole; > > > > > > + } > > > > > > + } > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > if (err < 0) > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > return err; > > > > > > } > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > if (err < 0) > > > > > > return err; > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > + vi->rq[qp_index].page_pool); > > > > > > + else > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > + NULL); > > > > > > + > > > > > > if (err < 0) > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > } > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > break; > > > > > > } > > > > > > } > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > switch (sset) { > > > > > > case ETH_SS_STATS: > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > default: > > > > > > return -EOPNOTSUPP; > > > > > > } > > > > > > } > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > +{ > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > + int i; > > > > > > + > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > + if (!vi->rq[i].page_pool) > > > > > > + continue; > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > + } > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > +} > > > > > > + > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > { > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > } > > > > > > + > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > } > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > + if (vi->rq[i].page_pool) > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > } > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > - else if (vi->big_packets) > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > + if (vi->rq[i].page_pool) { > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > + virt_to_head_page(buf), > > > > > > + true); > > > > > > + } else { > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > + } > > > > > > + } else if (vi->big_packets) { > > > > > > give_pages(&vi->rq[i], buf); > > > > > > - else > > > > > > + } else { > > > > > > put_page(virt_to_head_page(buf)); > > > > > > + } > > > > > > } > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > virtnet_free_queues(vi); > > > > > > } > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > +{ > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > + > > > > > > + struct page_pool_params pp_params = { > > > > > > + .order = 0, > > > > > > + .pool_size = rq->vq->num_max, > > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > > + .dev = vdev->dev.parent, > > > > > > + .offset = 0, > > > > > > + }; > > > > > > + > > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > + rq->page_pool = NULL; > > > > > > + } > > > > > > +} > > > > > > + > > > > > > /* How large should a single buffer be 
so a queue full of these can fit at > > > > > > * least one full packet? > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > + > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > + else > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > + "page pool only support mergeable mode\n"); > > > > > > + > > > > > > } > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > -- > > > > > > 2.31.1 > > > > > > > >
On Wed, Jun 7, 2023 at 5:36 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Wed, 7 Jun 2023 17:08:59 +0800, Liang Chen <liangchen.linux@gmail.com> wrote:
> > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > >
> > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > >
> > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > in the normal path.
> > > > > > > Upstream codebase: 47.5 Gbits/sec
> > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > >
> > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > gain is observed in XDP cpumap:
> > > > > > > Upstream codebase: 1.38 Gbits/sec
> > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > >
> > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > DMA map/unmap support.
> > > > > > >
> > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > >
> > > > > > Why off by default?
> > > > > > I am guessing it sometimes has performance costs too?
> > > > > >
> > > > > >
> > > > > > What happens if we use page pool for big mode too?
> > > > > > The less modes we have the better...
> > > > > >
> > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > packet size is very small, it reduces the likelihood of skb
> > > > > coalescing. But such cases are rare.
> > > >
> > > > small packets are rare? These workloads are easy to create actually.
> > > > Pls try and include benchmark with small packet size.
> > > >
> > > Sure, Thanks!
> >
> > Before going ahead and posting v2 patch, I would like to hear more
> > advice for the cases of small packets. I have done more performance
> > benchmark with small packets since then. Here is a list of iperf
> > output,
>
> Could you show the command line?
>
> Thanks

Sure.
iperf3 -c <IP> -i 5 -f g -t 0 -l <packet size> > > > > With PP and PP fragmenting: > > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes > > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 > > 223 KBytes > > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 > > 324 KBytes > > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 > > 1.08 MBytes > > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 > > 744 KBytes > > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes > > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes > > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes > > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes > > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes > > > > Without PP: > > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes > > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes > > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes > > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes > > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes > > 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes > > 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes > > 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes > > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes > > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes > > > > > > The major factor contributing to the performance drop is the reduction > > of skb coalescing. Additionally, without the page pool, small packets > > can still benefit from the allocation of 8 continuous pages by > > breaking them down into smaller pieces. This effectively reduces the > > frequency of page allocation from the buddy system. For instance, the > > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > > the benefits of using a page pool are limited in such cases. In fact, > > without page pool fragmenting enabled, it can even hinder performance > > from this perspective. > > > > Upon further consideration, I tend to believe making page pool the > > default option may not be appropriate. As you pointed out, we cannot > > simply ignore the performance impact on small packets. Any comments on > > this will be much appreciated. > > > > > > Thanks, > > Liang > > > > > > > > > The usage of page pool for big mode is being evaluated now. Thanks! > > > > > > > > > > > > --- > > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > > --- a/drivers/net/virtio_net.c > > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > > module_param(gso, bool, 0444); > > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > > + > > > > > > > /* FIXME: MTU in config. */ > > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > > #define GOOD_COPY_LEN 128 > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > > /* Chain pages by the private ptr. 
*/ > > > > > > > struct page *pages; > > > > > > > > > > > > > > + /* Page pool */ > > > > > > > + struct page_pool *page_pool; > > > > > > > + > > > > > > > /* Average packet length for mergeable receive buffers. */ > > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > > return skb; > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > > +{ > > > > > > > + if (rq->page_pool) > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > + else > > > > > > > + put_page(page); > > > > > > > +} > > > > > > > + > > > > > > > /* Called from bottom half context */ > > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > struct receive_queue *rq, > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > > if (page_to_free) > > > > > > > - put_page(page_to_free); > > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > > > return skb; > > > > > > > } > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > > return ret; > > > > > > > } > > > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > > { > > > > > > > struct skb_shared_info *shinfo; > > > > > > > struct page *xdp_page; > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > > - put_page(xdp_page); > > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > > } > > > > > > > } > > > > > > > } > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > > return NULL; > > > > > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > > > + if (rq->page_pool) > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + else > > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > > + > > > > > > > if (!page) > > > > > > > return NULL; > > > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > * is sending packet larger than the MTU. 
> > > > > > > */ > > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > > - put_page(p); > > > > > > > + virtnet_put_page(rq, p); > > > > > > > goto err_buf; > > > > > > > } > > > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > > page_address(p) + off, buflen); > > > > > > > page_off += buflen; > > > > > > > - put_page(p); > > > > > > > + virtnet_put_page(rq, p); > > > > > > > } > > > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > > return page; > > > > > > > err_buf: > > > > > > > - __free_pages(page, 0); > > > > > > > + if (rq->page_pool) > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > + else > > > > > > > + __free_pages(page, 0); > > > > > > > return NULL; > > > > > > > } > > > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > > } > > > > > > > stats->bytes += len; > > > > > > > page = virt_to_head_page(buf); > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > } > > > > > > > } > > > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > cur_frag_size = truesize; > > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > > dev->stats.rx_length_errors++; > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > return 0; > > > > > > > > > > > > > > err: > > > > > > > - put_xdp_frags(xdp); > > > > > > > + put_xdp_frags(xdp, rq); > > > > > > > return -EINVAL; > > > > > > > } > > > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > > return NULL; > > > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > + if (rq->page_pool) > > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + else > > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > if (!xdp_page) > > > > > > > return NULL; > > > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > > > - put_page(*page); > > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > > > > > *page = xdp_page; > > > > > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > > if (unlikely(!head_skb)) > > > > > > > break; > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > > return head_skb; > > > > > > > > > > > > > > case XDP_TX: > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > break; > > > > > > > } > > > > > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > > > err_xdp: > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > 
> mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > stats->xdp_drops++; > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > > curr_skb = head_skb; > > > > > > > > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > + > > > > > > > if (unlikely(!curr_skb)) > > > > > > > goto err_skb; > > > > > > > while (--num_buf) { > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > curr_skb = nskb; > > > > > > > head_skb->truesize += nskb->truesize; > > > > > > > num_skb_frags = 0; > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > } > > > > > > > if (curr_skb != head_skb) { > > > > > > > head_skb->data_len += len; > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > } > > > > > > > offset = buf - page_address(page); > > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > > len, truesize); > > > > > > > } else { > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > return head_skb; > > > > > > > > > > > > > > err_skb: > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > err_buf: > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > > * disabled GSO for XDP, it won't be a big issue. > > > > > > > */ > > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > - return -ENOMEM; > > > > > > > + if (rq->page_pool) { > > > > > > > + struct page *page; > > > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > - get_page(alloc_frag->page); > > > > > > > - alloc_frag->offset += len + room; > > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > > - if (hole < len + room) { > > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > > - * the current buffer. > > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > > > - */ > > > > > > > - if (!headroom) > > > > > > > - len += hole; > > > > > > > - alloc_frag->offset += hole; > > > > > > > - } > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + if (unlikely(!page)) > > > > > > > + return -ENOMEM; > > > > > > > + buf = (char *)page_address(page); > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > + } else { > > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > + return -ENOMEM; > > > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > + get_page(alloc_frag->page); > > > > > > > + alloc_frag->offset += len + room; > > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > > + if (hole < len + room) { > > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > > + * the current buffer. > > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > > + */ > > > > > > > + if (!headroom) > > > > > > > + len += hole; > > > > > > > + alloc_frag->offset += hole; > > > > > > > + } > > > > > > > + } > > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > > if (err < 0) > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > > > return err; > > > > > > > } > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > > if (err < 0) > > > > > > > return err; > > > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > > + vi->rq[qp_index].page_pool); > > > > > > > + else > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > > + NULL); > > > > > > > + > > > > > > > if (err < 0) > > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > > } > > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > > break; > > > > > > > } > > > > > > > } > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > > switch (sset) { > > > > > > > case ETH_SS_STATS: > > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > > default: > > > > > > > return -EOPNOTSUPP; > > > > > > > } > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > > +{ > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > > + int i; > > > > > > > + > > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > > + if (!vi->rq[i].page_pool) > > > > > > > + continue; > > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > > + } > > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > > +} > > > > > > > + > > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > > { > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > > } > > > > > > > + > > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > > } > > > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > > + if (vi->rq[i].page_pool) > > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > > } > > > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > - else if (vi->big_packets) > > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > > + if (vi->rq[i].page_pool) { > > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > > + virt_to_head_page(buf), > > > > > > > + true); > > > > > > > + } else { > > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > > + } > > > > > > > + } else if (vi->big_packets) { > > > > > > > give_pages(&vi->rq[i], buf); > > > > > > > - else > > > > > > > + } else { > > > > > > > put_page(virt_to_head_page(buf)); > > > > > > > + } > > > > > > > } > > > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > > virtnet_free_queues(vi); > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > > +{ > > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > > + > > > > > > > + struct page_pool_params pp_params = { > > > > > > > + .order = 0, > > > > > > > + .pool_size = rq->vq->num_max, > > > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > > > + .dev = vdev->dev.parent, > > > > > > > + .offset = 0, > > > > > > > + }; > > > > > > > + > > > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", 
> > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > > + rq->page_pool = NULL; > > > > > > > + } > > > > > > > +} > > > > > > > + > > > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > > > * least one full packet? > > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > > + > > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > > + else > > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > > + "page pool only support mergeable mode\n"); > > > > > > > + > > > > > > > } > > > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > > -- > > > > > > > 2.31.1 > > > > > > > > > >
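As an aside on the allocation-amortization argument above (32 packets of ~1KB
needing only one alloc_page call on the page_frag path): the effect can be
made concrete with a small stand-alone sketch. This is ordinary user-space C,
not driver code; the 1KB buffer size and the order-3 (8-page, 32KB) refill
chunk are assumptions taken from the discussion, not measurements.

/*
 * Compare how many allocator calls are needed to back N small receive
 * buffers: carving them out of one higher-order page-frag chunk (the
 * existing skb_page_frag_refill() scheme) versus taking one full page
 * per buffer from a page pool without fragmentation support.
 */
#include <stdio.h>

#define PAGE_SZ    4096UL
#define FRAG_CHUNK (8 * PAGE_SZ)   /* order-3 chunk, 32KB */

int main(void)
{
	unsigned long pkts = 32, buf = 1024;   /* 32 packets of ~1KB each */
	unsigned long frag_allocs = (pkts * buf + FRAG_CHUNK - 1) / FRAG_CHUNK;
	unsigned long pool_pages  = pkts;      /* one pooled page per packet */

	printf("page_frag chunk allocations: %lu\n", frag_allocs);   /* 1  */
	printf("per-packet pool pages:       %lu\n", pool_pages);    /* 32 */
	return 0;
}

With page pool fragmentation integrated later, as the commit message plans,
multiple small buffers could again share one pooled page and the gap above
should narrow.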
On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote: > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > in the normal path. > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > gain is observed in XDP cpumap: > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > DMA map/unmap support. > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > > > > Why off by default? > > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > > The less modes we have the better... > > > > > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > > packet size is very small, it reduces the likelihood of skb > > > > coalescing. But such cases are rare. > > > > > > small packets are rare? These workloads are easy to create actually. > > > Pls try and include benchmark with small packet size. > > > > > > > Sure, Thanks! > > Before going ahead and posting v2 patch, I would like to hear more > advice for the cases of small packets. I have done more performance > benchmark with small packets since then. 
Here is a list of iperf > output, > > With PP and PP fragmenting: > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 > 223 KBytes > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 > 324 KBytes > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 > 1.08 MBytes > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 > 744 KBytes > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes > > Without PP: > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes > 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes > 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes > 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes > > > The major factor contributing to the performance drop is the reduction > of skb coalescing. Additionally, without the page pool, small packets > can still benefit from the allocation of 8 continuous pages by > breaking them down into smaller pieces. This effectively reduces the > frequency of page allocation from the buddy system. For instance, the > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > the benefits of using a page pool are limited in such cases. In fact, > without page pool fragmenting enabled, it can even hinder performance > from this perspective. > > Upon further consideration, I tend to believe making page pool the > default option may not be appropriate. As you pointed out, we cannot > simply ignore the performance impact on small packets. Any comments on > this will be much appreciated. > > > Thanks, > Liang So, let's only use page pool for XDP then? > > > > > The usage of page pool for big mode is being evaluated now. Thanks! > > > > > > > > > > --- > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > --- a/drivers/net/virtio_net.c > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > module_param(gso, bool, 0444); > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > + > > > > > > /* FIXME: MTU in config. */ > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > #define GOOD_COPY_LEN 128 > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > /* Chain pages by the private ptr. 
*/ > > > > > > struct page *pages; > > > > > > > > > > > > + /* Page pool */ > > > > > > + struct page_pool *page_pool; > > > > > > + > > > > > > /* Average packet length for mergeable receive buffers. */ > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > return skb; > > > > > > } > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > +{ > > > > > > + if (rq->page_pool) > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > + else > > > > > > + put_page(page); > > > > > > +} > > > > > > + > > > > > > /* Called from bottom half context */ > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > struct receive_queue *rq, > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > if (page_to_free) > > > > > > - put_page(page_to_free); > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > return skb; > > > > > > } > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > return ret; > > > > > > } > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > { > > > > > > struct skb_shared_info *shinfo; > > > > > > struct page *xdp_page; > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > - put_page(xdp_page); > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > } > > > > > > } > > > > > > } > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > return NULL; > > > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > > + if (rq->page_pool) > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > + else > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > + > > > > > > if (!page) > > > > > > return NULL; > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > * is sending packet larger than the MTU. 
> > > > > > */ > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > - put_page(p); > > > > > > + virtnet_put_page(rq, p); > > > > > > goto err_buf; > > > > > > } > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > page_address(p) + off, buflen); > > > > > > page_off += buflen; > > > > > > - put_page(p); > > > > > > + virtnet_put_page(rq, p); > > > > > > } > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > return page; > > > > > > err_buf: > > > > > > - __free_pages(page, 0); > > > > > > + if (rq->page_pool) > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > + else > > > > > > + __free_pages(page, 0); > > > > > > return NULL; > > > > > > } > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > } > > > > > > stats->bytes += len; > > > > > > page = virt_to_head_page(buf); > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > } > > > > > > } > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > cur_frag_size = truesize; > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > dev->stats.rx_length_errors++; > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > return 0; > > > > > > > > > > > > err: > > > > > > - put_xdp_frags(xdp); > > > > > > + put_xdp_frags(xdp, rq); > > > > > > return -EINVAL; > > > > > > } > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > return NULL; > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > + if (rq->page_pool) > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > + else > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > > if (!xdp_page) > > > > > > return NULL; > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > - put_page(*page); > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > > > *page = xdp_page; > > > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > if (unlikely(!head_skb)) > > > > > > break; > > > > > > + if (rq->page_pool) > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > return head_skb; > > > > > > > > > > > > case XDP_TX: > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > break; > > > > > > } > > > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > err_xdp: > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > stats->xdp_drops++; > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev, > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > curr_skb = head_skb; > > > > > > > > > > > > + if (rq->page_pool) > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > + > > > > > > if (unlikely(!curr_skb)) > > > > > > goto err_skb; > > > > > > while (--num_buf) { > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > curr_skb = nskb; > > > > > > head_skb->truesize += nskb->truesize; > > > > > > num_skb_frags = 0; > > > > > > + if (rq->page_pool) > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > } > > > > > > if (curr_skb != head_skb) { > > > > > > head_skb->data_len += len; > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > } > > > > > > offset = buf - page_address(page); > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > len, truesize); > > > > > > } else { > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > return head_skb; > > > > > > > > > > > > err_skb: > > > > > > - put_page(page); > > > > > > + virtnet_put_page(rq, page); > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > err_buf: > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > * disabled GSO for XDP, it won't be a big issue. > > > > > > */ > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > - return -ENOMEM; > > > > > > + if (rq->page_pool) { > > > > > > + struct page *page; > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > - get_page(alloc_frag->page); > > > > > > - alloc_frag->offset += len + room; > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > - if (hole < len + room) { > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > - * the current buffer. > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > > - */ > > > > > > - if (!headroom) > > > > > > - len += hole; > > > > > > - alloc_frag->offset += hole; > > > > > > - } > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > + if (unlikely(!page)) > > > > > > + return -ENOMEM; > > > > > > + buf = (char *)page_address(page); > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > + } else { > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > + return -ENOMEM; > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > + get_page(alloc_frag->page); > > > > > > + alloc_frag->offset += len + room; > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > + if (hole < len + room) { > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > + * the current buffer. > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > + */ > > > > > > + if (!headroom) > > > > > > + len += hole; > > > > > > + alloc_frag->offset += hole; > > > > > > + } > > > > > > + } > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > if (err < 0) > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > return err; > > > > > > } > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > if (err < 0) > > > > > > return err; > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > + vi->rq[qp_index].page_pool); > > > > > > + else > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > + NULL); > > > > > > + > > > > > > if (err < 0) > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > } > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > break; > > > > > > } > > > > > > } > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > switch (sset) { > > > > > > case ETH_SS_STATS: > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > default: > > > > > > return -EOPNOTSUPP; > > > > > > } > > > > > > } > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > +{ > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > + int i; > > > > > > + > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > + if (!vi->rq[i].page_pool) > > > > > > + continue; > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > + } > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > +} > > > > > > + > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > { > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > } > > > > > > + > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > } > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > + if (vi->rq[i].page_pool) > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > } > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > - else if (vi->big_packets) > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > + if (vi->rq[i].page_pool) { > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > + virt_to_head_page(buf), > > > > > > + true); > > > > > > + } else { > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > + } > > > > > > + } else if (vi->big_packets) { > > > > > > give_pages(&vi->rq[i], buf); > > > > > > - else > > > > > > + } else { > > > > > > put_page(virt_to_head_page(buf)); > > > > > > + } > > > > > > } > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > virtnet_free_queues(vi); > > > > > > } > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > +{ > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > + > > > > > > + struct page_pool_params pp_params = { > > > > > > + .order = 0, > > > > > > + .pool_size = rq->vq->num_max, > > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > > + .dev = vdev->dev.parent, > > > > > > + .offset = 0, > > > > > > + }; > > > > > > + > > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > + rq->page_pool = NULL; > > > > > > + } > > > > > > +} > > > > > > + > > > > > > /* How large should a single buffer be 
so a queue full of these can fit at > > > > > > * least one full packet? > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > + > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > + else > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > + "page pool only support mergeable mode\n"); > > > > > > + > > > > > > } > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > -- > > > > > > 2.31.1 > > > > > > > >
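On the "page pool only for XDP" direction raised above, a rough sketch of what
that could look like, reusing only the page_pool calls already present in the
patch. The helper name and the idea of calling it from the XDP setup path
(and destroying the pool when the program is detached) are assumptions for
illustration, not part of the posted series.

/* Hypothetical helper: create the pool only when an XDP program is being
 * attached, so the normal receive path keeps the existing page_frag scheme
 * and small-packet traffic is unaffected.
 */
static int virtnet_create_page_pool(struct receive_queue *rq)
{
	struct virtio_device *vdev = rq->vq->vdev;
	struct page_pool_params pp_params = {
		.order = 0,
		.pool_size = rq->vq->num_max,
		.nid = dev_to_node(vdev->dev.parent),
		.dev = vdev->dev.parent,
		.offset = 0,
	};
	struct page_pool *pool;

	pool = page_pool_create(&pp_params);
	if (IS_ERR(pool))
		return PTR_ERR(pool);

	rq->page_pool = pool;
	return 0;
}

The rx mem model registration would then pick MEM_TYPE_PAGE_POOL or
MEM_TYPE_PAGE_SHARED depending on whether the pool exists, as the
virtnet_enable_queue_pair() hunk above already does.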
On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > >
> > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > >
> > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > in the normal path.
> > > > > > > Upstream codebase: 47.5 Gbits/sec
> > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > >
> > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > gain is observed in XDP cpumap:
> > > > > > > Upstream codebase: 1.38 Gbits/sec
> > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > >
> > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > DMA map/unmap support.
> > > > > > >
> > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > >
> > > > > > Why off by default?
> > > > > > I am guessing it sometimes has performance costs too?
> > > > > >
> > > > > >
> > > > > > What happens if we use page pool for big mode too?
> > > > > > The less modes we have the better...
> > > > > >
> > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > packet size is very small, it reduces the likelihood of skb
> > > > > coalescing. But such cases are rare.
> > > >
> > > > small packets are rare? These workloads are easy to create actually.
> > > > Pls try and include benchmark with small packet size.
> > > >
> > > Sure, Thanks!
> >
> > Before going ahead and posting v2 patch, I would like to hear more
> > advice for the cases of small packets. I have done more performance
> > benchmark with small packets since then. Here is a list of iperf
> > output,
> >
> > With PP and PP fragmenting:
> > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes
> > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 223 KBytes
> > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 324 KBytes
> > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 1.08 MBytes
> > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 744 KBytes
> > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes
> > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes
> > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes
> > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes
> > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes

Note that the virtio-net driver is lacking things like BQL and others, so it
might suffer from buffer bloat for TCP performance. Would you mind measuring
with, e.g., testpmd on the vhost to see the rx PPS?
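(For the PPS measurement suggested here, one possible setup is to let DPDK's
testpmd own the vhost-user port on the host and poll it in rxonly mode; the
vdev string, core list and option spelling below are only an example and will
vary with the DPDK version and the socket path in use.)

dpdk-testpmd -l 0-1 --no-pci \
    --vdev 'net_vhost0,iface=/tmp/vhost-user0.sock,queues=1' \
    -- -i --forward-mode=rxonly
testpmd> start
testpmd> show port stats all    # Rx-pps / Tx-pps for the vhost port

Reading the per-port PPS counters on the vhost side sidesteps the TCP buffer
bloat effect mentioned above, so the comparison focuses on the per-packet
receive cost rather than TCP dynamics.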
> > > > Without PP: > > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes > > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes > > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes > > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes > > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes > > 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes > > 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes > > 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes > > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes > > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes > > > > > > The major factor contributing to the performance drop is the reduction > > of skb coalescing. Additionally, without the page pool, small packets > > can still benefit from the allocation of 8 continuous pages by > > breaking them down into smaller pieces. This effectively reduces the > > frequency of page allocation from the buddy system. For instance, the > > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > > the benefits of using a page pool are limited in such cases. I wonder if we can improve page pool in this case anyhow. > In fact, > > without page pool fragmenting enabled, it can even hinder performance > > from this perspective. > > > > Upon further consideration, I tend to believe making page pool the > > default option may not be appropriate. As you pointed out, we cannot > > simply ignore the performance impact on small packets. Any comments on > > this will be much appreciated. > > > > > > Thanks, > > Liang > > > So, let's only use page pool for XDP then? +1 We can start from this. Thanks > > > > > > > > The usage of page pool for big mode is being evaluated now. Thanks! > > > > > > > > > > > > --- > > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > > --- a/drivers/net/virtio_net.c > > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > > module_param(gso, bool, 0444); > > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > > + > > > > > > > /* FIXME: MTU in config. */ > > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > > #define GOOD_COPY_LEN 128 > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > > /* Chain pages by the private ptr. */ > > > > > > > struct page *pages; > > > > > > > > > > > > > > + /* Page pool */ > > > > > > > + struct page_pool *page_pool; > > > > > > > + > > > > > > > /* Average packet length for mergeable receive buffers. 
*/ > > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > > return skb; > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > > +{ > > > > > > > + if (rq->page_pool) > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > + else > > > > > > > + put_page(page); > > > > > > > +} > > > > > > > + > > > > > > > /* Called from bottom half context */ > > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > struct receive_queue *rq, > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > > if (page_to_free) > > > > > > > - put_page(page_to_free); > > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > > > return skb; > > > > > > > } > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > > return ret; > > > > > > > } > > > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > > { > > > > > > > struct skb_shared_info *shinfo; > > > > > > > struct page *xdp_page; > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > > - put_page(xdp_page); > > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > > } > > > > > > > } > > > > > > > } > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > > return NULL; > > > > > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > > > + if (rq->page_pool) > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + else > > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > > + > > > > > > > if (!page) > > > > > > > return NULL; > > > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > * is sending packet larger than the MTU. 
> > > > > > > */ > > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > > - put_page(p); > > > > > > > + virtnet_put_page(rq, p); > > > > > > > goto err_buf; > > > > > > > } > > > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > > page_address(p) + off, buflen); > > > > > > > page_off += buflen; > > > > > > > - put_page(p); > > > > > > > + virtnet_put_page(rq, p); > > > > > > > } > > > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > > return page; > > > > > > > err_buf: > > > > > > > - __free_pages(page, 0); > > > > > > > + if (rq->page_pool) > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > + else > > > > > > > + __free_pages(page, 0); > > > > > > > return NULL; > > > > > > > } > > > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > > } > > > > > > > stats->bytes += len; > > > > > > > page = virt_to_head_page(buf); > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > } > > > > > > > } > > > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > cur_frag_size = truesize; > > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > > dev->stats.rx_length_errors++; > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > return 0; > > > > > > > > > > > > > > err: > > > > > > > - put_xdp_frags(xdp); > > > > > > > + put_xdp_frags(xdp, rq); > > > > > > > return -EINVAL; > > > > > > > } > > > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > > return NULL; > > > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > + if (rq->page_pool) > > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + else > > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > if (!xdp_page) > > > > > > > return NULL; > > > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > > > - put_page(*page); > > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > > > > > *page = xdp_page; > > > > > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > > if (unlikely(!head_skb)) > > > > > > > break; > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > > return head_skb; > > > > > > > > > > > > > > case XDP_TX: > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > break; > > > > > > > } > > > > > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > > > err_xdp: > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > 
> mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > stats->xdp_drops++; > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > > curr_skb = head_skb; > > > > > > > > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > + > > > > > > > if (unlikely(!curr_skb)) > > > > > > > goto err_skb; > > > > > > > while (--num_buf) { > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > curr_skb = nskb; > > > > > > > head_skb->truesize += nskb->truesize; > > > > > > > num_skb_frags = 0; > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > } > > > > > > > if (curr_skb != head_skb) { > > > > > > > head_skb->data_len += len; > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > } > > > > > > > offset = buf - page_address(page); > > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > > len, truesize); > > > > > > > } else { > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > return head_skb; > > > > > > > > > > > > > > err_skb: > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > err_buf: > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > > * disabled GSO for XDP, it won't be a big issue. > > > > > > > */ > > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > - return -ENOMEM; > > > > > > > + if (rq->page_pool) { > > > > > > > + struct page *page; > > > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > - get_page(alloc_frag->page); > > > > > > > - alloc_frag->offset += len + room; > > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > > - if (hole < len + room) { > > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > > - * the current buffer. > > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > > > - */ > > > > > > > - if (!headroom) > > > > > > > - len += hole; > > > > > > > - alloc_frag->offset += hole; > > > > > > > - } > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + if (unlikely(!page)) > > > > > > > + return -ENOMEM; > > > > > > > + buf = (char *)page_address(page); > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > + } else { > > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > + return -ENOMEM; > > > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > + get_page(alloc_frag->page); > > > > > > > + alloc_frag->offset += len + room; > > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > > + if (hole < len + room) { > > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > > + * the current buffer. > > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > > + */ > > > > > > > + if (!headroom) > > > > > > > + len += hole; > > > > > > > + alloc_frag->offset += hole; > > > > > > > + } > > > > > > > + } > > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > > if (err < 0) > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > > > return err; > > > > > > > } > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > > if (err < 0) > > > > > > > return err; > > > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > > + vi->rq[qp_index].page_pool); > > > > > > > + else > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > > + NULL); > > > > > > > + > > > > > > > if (err < 0) > > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > > } > > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > > break; > > > > > > > } > > > > > > > } > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > > switch (sset) { > > > > > > > case ETH_SS_STATS: > > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > > default: > > > > > > > return -EOPNOTSUPP; > > > > > > > } > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > > +{ > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > > + int i; > > > > > > > + > > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > > + if (!vi->rq[i].page_pool) > > > > > > > + continue; > > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > > + } > > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > > +} > > > > > > > + > > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > > { > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > > } > > > > > > > + > > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > > } > > > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > > + if (vi->rq[i].page_pool) > > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > > } > > > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > - else if (vi->big_packets) > > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > > + if (vi->rq[i].page_pool) { > > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > > + virt_to_head_page(buf), > > > > > > > + true); > > > > > > > + } else { > > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > > + } > > > > > > > + } else if (vi->big_packets) { > > > > > > > give_pages(&vi->rq[i], buf); > > > > > > > - else > > > > > > > + } else { > > > > > > > put_page(virt_to_head_page(buf)); > > > > > > > + } > > > > > > > } > > > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > > virtnet_free_queues(vi); > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > > +{ > > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > > + > > > > > > > + struct page_pool_params pp_params = { > > > > > > > + .order = 0, > > > > > > > + .pool_size = rq->vq->num_max, > > > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > > > + .dev = vdev->dev.parent, > > > > > > > + .offset = 0, > > > > > > > + }; > > > > > > > + > > > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", 
> > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > > + rq->page_pool = NULL; > > > > > > > + } > > > > > > > +} > > > > > > > + > > > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > > > * least one full packet? > > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > > + > > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > > + else > > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > > + "page pool only support mergeable mode\n"); > > > > > > > + > > > > > > > } > > > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > > -- > > > > > > > 2.31.1 > > > > > > > > > > >
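The skb coalescing argument quoted in the reply above rests on how the non-page-pool path amortizes buddy allocations: skb_page_frag_refill() grabs one higher-order chunk (up to 8 contiguous pages) and hands out many small receive buffers from it, which is why 32 arriving 1K packets can cost a single allocation. Below is a minimal sketch of that pattern using only existing kernel helpers; carve_small_rx_buf() is an invented name for illustration, the real logic lives in add_recvbuf_mergeable().

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/skbuff.h>

/* One higher-order page from the buddy allocator is carved into many
 * small buffers by advancing an offset, so e.g. 32 x 1K packets are
 * satisfied by a single page allocation.
 */
static char *carve_small_rx_buf(struct page_frag *alloc_frag,
				unsigned int len, gfp_t gfp)
{
	char *buf;

	/* Refills alloc_frag->page only when the current chunk is used up. */
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return NULL;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);	/* one reference per carved buffer */
	alloc_frag->offset += len;
	return buf;
}

Since an order-0 page pool returns one full page per buffer, it cannot match this amortization for small packets unless page pool fragmenting is used, which is exactly the trade-off being discussed in this thread.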
On Thu, 8 Jun 2023 08:38:14 +0800, Jason Wang <jasowang@redhat.com> wrote: > On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote: > > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > > > in the normal path. > > > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > > > gain is observed in XDP cpumap: > > > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > > > DMA map/unmap support. > > > > > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > > > > > > > > Why off by default? > > > > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > > > > The less modes we have the better... > > > > > > > > > > > > > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > > > > packet size is very small, it reduces the likelihood of skb > > > > > > coalescing. But such cases are rare. > > > > > > > > > > small packets are rare? These workloads are easy to create actually. > > > > > Pls try and include benchmark with small packet size. > > > > > > > > > > > > > Sure, Thanks! > > > > > > Before going ahead and posting v2 patch, I would like to hear more > > > advice for the cases of small packets. I have done more performance > > > benchmark with small packets since then. Here is a list of iperf > > > output, > > > > > > With PP and PP fragmenting: > > > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes > > > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 > > > 223 KBytes > > > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 > > > 324 KBytes > > > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 > > > 1.08 MBytes > > > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 > > > 744 KBytes > > > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes > > > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes > > > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes > > > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes > > > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes > > Note that virtio-net driver is lacking things like BQL and others, so > it might suffer from buffer bloat for TCP performance. 
Would you mind > to measure with e.g using testpmd on the vhost to see the rx PPS? > > > > > > > Without PP: > > > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes > > > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes > > > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes > > > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes > > > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes > > > 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes > > > 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes > > > 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes > > > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes > > > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes > > > > > > > > > The major factor contributing to the performance drop is the reduction > > > of skb coalescing. Additionally, without the page pool, small packets > > > can still benefit from the allocation of 8 continuous pages by > > > breaking them down into smaller pieces. This effectively reduces the > > > frequency of page allocation from the buddy system. For instance, the > > > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > > > the benefits of using a page pool are limited in such cases. > > I wonder if we can improve page pool in this case anyhow. > > > In fact, > > > without page pool fragmenting enabled, it can even hinder performance > > > from this perspective. > > > > > > Upon further consideration, I tend to believe making page pool the > > > default option may not be appropriate. As you pointed out, we cannot > > > simply ignore the performance impact on small packets. Any comments on > > > this will be much appreciated. > > > > > > > > > Thanks, > > > Liang > > > > > > So, let's only use page pool for XDP then? > > +1 +1 Thanks. > > We can start from this. > > Thanks > > > > > > > > > > > > The usage of page pool for big mode is being evaluated now. Thanks! > > > > > > > > > > > > > > --- > > > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > > > --- a/drivers/net/virtio_net.c > > > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > > > module_param(gso, bool, 0444); > > > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > > > + > > > > > > > > /* FIXME: MTU in config. */ > > > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > > > #define GOOD_COPY_LEN 128 > > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > > > /* Chain pages by the private ptr. */ > > > > > > > > struct page *pages; > > > > > > > > > > > > > > > > + /* Page pool */ > > > > > > > > + struct page_pool *page_pool; > > > > > > > > + > > > > > > > > /* Average packet length for mergeable receive buffers. 
*/ > > > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > > > return skb; > > > > > > > > } > > > > > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > > > +{ > > > > > > > > + if (rq->page_pool) > > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > > + else > > > > > > > > + put_page(page); > > > > > > > > +} > > > > > > > > + > > > > > > > > /* Called from bottom half context */ > > > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > > struct receive_queue *rq, > > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > > > if (page_to_free) > > > > > > > > - put_page(page_to_free); > > > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > > > > > return skb; > > > > > > > > } > > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > > > return ret; > > > > > > > > } > > > > > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > > > { > > > > > > > > struct skb_shared_info *shinfo; > > > > > > > > struct page *xdp_page; > > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > > > - put_page(xdp_page); > > > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > > > } > > > > > > > > } > > > > > > > > } > > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > > > return NULL; > > > > > > > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > > > > + if (rq->page_pool) > > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > > + else > > > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > > > + > > > > > > > > if (!page) > > > > > > > > return NULL; > > > > > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > > * is sending packet larger than the MTU. 
> > > > > > > > */ > > > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > > > - put_page(p); > > > > > > > > + virtnet_put_page(rq, p); > > > > > > > > goto err_buf; > > > > > > > > } > > > > > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > > > page_address(p) + off, buflen); > > > > > > > > page_off += buflen; > > > > > > > > - put_page(p); > > > > > > > > + virtnet_put_page(rq, p); > > > > > > > > } > > > > > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > > > return page; > > > > > > > > err_buf: > > > > > > > > - __free_pages(page, 0); > > > > > > > > + if (rq->page_pool) > > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > > + else > > > > > > > > + __free_pages(page, 0); > > > > > > > > return NULL; > > > > > > > > } > > > > > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > > > } > > > > > > > > stats->bytes += len; > > > > > > > > page = virt_to_head_page(buf); > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > } > > > > > > > > } > > > > > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > > cur_frag_size = truesize; > > > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > > > dev->stats.rx_length_errors++; > > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > > return 0; > > > > > > > > > > > > > > > > err: > > > > > > > > - put_xdp_frags(xdp); > > > > > > > > + put_xdp_frags(xdp, rq); > > > > > > > > return -EINVAL; > > > > > > > > } > > > > > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > > > return NULL; > > > > > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > > + if (rq->page_pool) > > > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > > + else > > > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > > if (!xdp_page) > > > > > > > > return NULL; > > > > > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > > > > > - put_page(*page); > > > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > > > > > > > *page = xdp_page; > > > > > > > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > > > if (unlikely(!head_skb)) > > > > > > > > break; > > > > > > > > + if (rq->page_pool) > > > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > > > return head_skb; > > > > > > > > > > > > > > > > case XDP_TX: > > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > > break; > > > > > > > > } > > > > > > > > > > > > > > > > - 
put_xdp_frags(&xdp); > > > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > > > > > err_xdp: > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > > > stats->xdp_drops++; > > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > > > curr_skb = head_skb; > > > > > > > > > > > > > > > > + if (rq->page_pool) > > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > > + > > > > > > > > if (unlikely(!curr_skb)) > > > > > > > > goto err_skb; > > > > > > > > while (--num_buf) { > > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > curr_skb = nskb; > > > > > > > > head_skb->truesize += nskb->truesize; > > > > > > > > num_skb_frags = 0; > > > > > > > > + if (rq->page_pool) > > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > > } > > > > > > > > if (curr_skb != head_skb) { > > > > > > > > head_skb->data_len += len; > > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > } > > > > > > > > offset = buf - page_address(page); > > > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > > > len, truesize); > > > > > > > > } else { > > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > return head_skb; > > > > > > > > > > > > > > > > err_skb: > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > > > err_buf: > > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > > > * disabled GSO for XDP, it won't be a big issue. > > > > > > > > */ > > > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > > - return -ENOMEM; > > > > > > > > + if (rq->page_pool) { > > > > > > > > + struct page *page; > > > > > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > > - get_page(alloc_frag->page); > > > > > > > > - alloc_frag->offset += len + room; > > > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > > > - if (hole < len + room) { > > > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > > > - * the current buffer. > > > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > > > > - */ > > > > > > > > - if (!headroom) > > > > > > > > - len += hole; > > > > > > > > - alloc_frag->offset += hole; > > > > > > > > - } > > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > > + if (unlikely(!page)) > > > > > > > > + return -ENOMEM; > > > > > > > > + buf = (char *)page_address(page); > > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > > + } else { > > > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > > + return -ENOMEM; > > > > > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > > + get_page(alloc_frag->page); > > > > > > > > + alloc_frag->offset += len + room; > > > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > > > + if (hole < len + room) { > > > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > > > + * the current buffer. > > > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > > > + */ > > > > > > > > + if (!headroom) > > > > > > > > + len += hole; > > > > > > > > + alloc_frag->offset += hole; > > > > > > > > + } > > > > > > > > + } > > > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > > > if (err < 0) > > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > > > > > return err; > > > > > > > > } > > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > > > if (err < 0) > > > > > > > > return err; > > > > > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > > > + vi->rq[qp_index].page_pool); > > > > > > > > + else > > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > > > + NULL); > > > > > > > > + > > > > > > > > if (err < 0) > > > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > > > } > > > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > > > break; > > > > > > > > } > > > > > > > > } > > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > > > switch (sset) { > > > > > > > > case ETH_SS_STATS: > > > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > > > default: > > > > > > > > return -EOPNOTSUPP; > > > > > > > > } > > > > > > > > } > > > > > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > > > +{ > > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > > > + int i; > > > > > > > > + > > > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > > > + if (!vi->rq[i].page_pool) > > > > > > > > + continue; > > > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > > > + } > > > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > > > +} > > > > > > > > + > > > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > > > { > > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > > > } > > > > > > > > + > > > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > > > } > > > > > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > > > + if (vi->rq[i].page_pool) > > > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > > > } > > > > > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > > - else if (vi->big_packets) > > > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > > > + if (vi->rq[i].page_pool) { > > > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > > > + virt_to_head_page(buf), > > > > > > > > + true); > > > > > > > > + } else { > > > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > > > + } > > > > > > > > + } else if (vi->big_packets) { > > > > > > > > give_pages(&vi->rq[i], buf); > > > > > > > > - else > > > > > > > > + } else { > > > > > > > > put_page(virt_to_head_page(buf)); > > > > > > > > + } > > > > > > > > } > > > > > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > > > virtnet_free_queues(vi); > > > > > > > > } > > > > > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > > > +{ > > > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > > > + > > > > > > > > + struct page_pool_params pp_params = { > > > > > > > > + .order = 0, > > > > > > > > + .pool_size = rq->vq->num_max, > > > > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > > > > + .dev = vdev->dev.parent, > > > > > > > > + .offset = 0, > > > > > > > > + }; > > > > > > > > + > > > > > > > 
> + rq->page_pool = page_pool_create(&pp_params); > > > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > > > + rq->page_pool = NULL; > > > > > > > > + } > > > > > > > > +} > > > > > > > > + > > > > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > > > > * least one full packet? > > > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > > > + > > > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > > > + else > > > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > > > + "page pool only support mergeable mode\n"); > > > > > > > > + > > > > > > > > } > > > > > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > > > -- > > > > > > > > 2.31.1 > > > > > > > > > > > > > > >
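Given the consensus above to use the page pool only for XDP, at least initially, here is a hypothetical sketch (not the posted patch) of how such gating could look inside drivers/net/virtio_net.c. virtnet_alloc_rx_page() is an invented helper; rq->xdp_prog is the driver's existing RCU-protected XDP program pointer, and rq->page_pool is the field added by this series. The matching free path would also have to return page-pool pages with page_pool_put_full_page() instead of put_page(), as the virtnet_put_page() helper in the patch does.

/* Use the per-queue page pool only while an XDP program is attached,
 * so the normal path keeps the page-frag allocator and its skb
 * coalescing behaviour.
 */
static struct page *virtnet_alloc_rx_page(struct receive_queue *rq)
{
	if (rcu_access_pointer(rq->xdp_prog) && rq->page_pool)
		return page_pool_dev_alloc_pages(rq->page_pool);

	return alloc_page(GFP_ATOMIC);
}

Keying the choice off the attached XDP program leaves TCP small-packet performance where it is today, while still letting the XDP paths (including cpumap redirect) benefit from page recycling.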
On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote: > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > > in the normal path. > > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > > gain is observed in XDP cpumap: > > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > > DMA map/unmap support. > > > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > > > > > > Why off by default? > > > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > > > The less modes we have the better... > > > > > > > > > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > > > packet size is very small, it reduces the likelihood of skb > > > > > coalescing. But such cases are rare. > > > > > > > > small packets are rare? These workloads are easy to create actually. > > > > Pls try and include benchmark with small packet size. > > > > > > > > > > Sure, Thanks! > > > > Before going ahead and posting v2 patch, I would like to hear more > > advice for the cases of small packets. I have done more performance > > benchmark with small packets since then. 
Here is a list of iperf > > output, > > > > With PP and PP fragmenting: > > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes > > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 > > 223 KBytes > > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 > > 324 KBytes > > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 > > 1.08 MBytes > > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 > > 744 KBytes > > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes > > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes > > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes > > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes > > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes > > > > Without PP: > > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes > > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes > > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes > > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes > > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes > > 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes > > 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes > > 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes > > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes > > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes > > > > > > The major factor contributing to the performance drop is the reduction > > of skb coalescing. Additionally, without the page pool, small packets > > can still benefit from the allocation of 8 continuous pages by > > breaking them down into smaller pieces. This effectively reduces the > > frequency of page allocation from the buddy system. For instance, the > > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > > the benefits of using a page pool are limited in such cases. In fact, > > without page pool fragmenting enabled, it can even hinder performance > > from this perspective. > > > > Upon further consideration, I tend to believe making page pool the > > default option may not be appropriate. As you pointed out, we cannot > > simply ignore the performance impact on small packets. Any comments on > > this will be much appreciated. > > > > > > Thanks, > > Liang > > > So, let's only use page pool for XDP then? > Sure. We will prepare v2 for xdp only, and come back later for the normal path when skb coalecsing works better with page pool. > > > > > > > The usage of page pool for big mode is being evaluated now. Thanks! > > > > > > > > > > > > --- > > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > > --- a/drivers/net/virtio_net.c > > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > > module_param(gso, bool, 0444); > > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > > + > > > > > > > /* FIXME: MTU in config. 
*/ > > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > > #define GOOD_COPY_LEN 128 > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > > /* Chain pages by the private ptr. */ > > > > > > > struct page *pages; > > > > > > > > > > > > > > + /* Page pool */ > > > > > > > + struct page_pool *page_pool; > > > > > > > + > > > > > > > /* Average packet length for mergeable receive buffers. */ > > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > > return skb; > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > > +{ > > > > > > > + if (rq->page_pool) > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > + else > > > > > > > + put_page(page); > > > > > > > +} > > > > > > > + > > > > > > > /* Called from bottom half context */ > > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > struct receive_queue *rq, > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > > if (page_to_free) > > > > > > > - put_page(page_to_free); > > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > > > return skb; > > > > > > > } > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > > return ret; > > > > > > > } > > > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > > { > > > > > > > struct skb_shared_info *shinfo; > > > > > > > struct page *xdp_page; > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > > - put_page(xdp_page); > > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > > } > > > > > > > } > > > > > > > } > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > > return NULL; > > > > > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > > > + if (rq->page_pool) > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + else > > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > > + > > > > > > > if (!page) > > > > > > > return NULL; > > > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > * is sending packet larger than the MTU. 
> > > > > > > */ > > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > > - put_page(p); > > > > > > > + virtnet_put_page(rq, p); > > > > > > > goto err_buf; > > > > > > > } > > > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > > page_address(p) + off, buflen); > > > > > > > page_off += buflen; > > > > > > > - put_page(p); > > > > > > > + virtnet_put_page(rq, p); > > > > > > > } > > > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > > return page; > > > > > > > err_buf: > > > > > > > - __free_pages(page, 0); > > > > > > > + if (rq->page_pool) > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > + else > > > > > > > + __free_pages(page, 0); > > > > > > > return NULL; > > > > > > > } > > > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > > } > > > > > > > stats->bytes += len; > > > > > > > page = virt_to_head_page(buf); > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > } > > > > > > > } > > > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > cur_frag_size = truesize; > > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > > dev->stats.rx_length_errors++; > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > return 0; > > > > > > > > > > > > > > err: > > > > > > > - put_xdp_frags(xdp); > > > > > > > + put_xdp_frags(xdp, rq); > > > > > > > return -EINVAL; > > > > > > > } > > > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > > return NULL; > > > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > + if (rq->page_pool) > > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + else > > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > if (!xdp_page) > > > > > > > return NULL; > > > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > > > - put_page(*page); > > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > > > > > *page = xdp_page; > > > > > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > > if (unlikely(!head_skb)) > > > > > > > break; > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > > return head_skb; > > > > > > > > > > > > > > case XDP_TX: > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > break; > > > > > > > } > > > > > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > > > err_xdp: > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > 
> mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > stats->xdp_drops++; > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > > curr_skb = head_skb; > > > > > > > > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > + > > > > > > > if (unlikely(!curr_skb)) > > > > > > > goto err_skb; > > > > > > > while (--num_buf) { > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > curr_skb = nskb; > > > > > > > head_skb->truesize += nskb->truesize; > > > > > > > num_skb_frags = 0; > > > > > > > + if (rq->page_pool) > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > } > > > > > > > if (curr_skb != head_skb) { > > > > > > > head_skb->data_len += len; > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > } > > > > > > > offset = buf - page_address(page); > > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > > len, truesize); > > > > > > > } else { > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > return head_skb; > > > > > > > > > > > > > > err_skb: > > > > > > > - put_page(page); > > > > > > > + virtnet_put_page(rq, page); > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > err_buf: > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > > * disabled GSO for XDP, it won't be a big issue. > > > > > > > */ > > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > - return -ENOMEM; > > > > > > > + if (rq->page_pool) { > > > > > > > + struct page *page; > > > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > - get_page(alloc_frag->page); > > > > > > > - alloc_frag->offset += len + room; > > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > > - if (hole < len + room) { > > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > > - * the current buffer. > > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > > > - */ > > > > > > > - if (!headroom) > > > > > > > - len += hole; > > > > > > > - alloc_frag->offset += hole; > > > > > > > - } > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > + if (unlikely(!page)) > > > > > > > + return -ENOMEM; > > > > > > > + buf = (char *)page_address(page); > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > + } else { > > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > + return -ENOMEM; > > > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > + get_page(alloc_frag->page); > > > > > > > + alloc_frag->offset += len + room; > > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > > + if (hole < len + room) { > > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > > + * the current buffer. > > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > > + */ > > > > > > > + if (!headroom) > > > > > > > + len += hole; > > > > > > > + alloc_frag->offset += hole; > > > > > > > + } > > > > > > > + } > > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > > if (err < 0) > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > > > return err; > > > > > > > } > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > > if (err < 0) > > > > > > > return err; > > > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > > + vi->rq[qp_index].page_pool); > > > > > > > + else > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > > + NULL); > > > > > > > + > > > > > > > if (err < 0) > > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > > } > > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > > break; > > > > > > > } > > > > > > > } > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > > switch (sset) { > > > > > > > case ETH_SS_STATS: > > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > > default: > > > > > > > return -EOPNOTSUPP; > > > > > > > } > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > > +{ > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > > + int i; > > > > > > > + > > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > > + if (!vi->rq[i].page_pool) > > > > > > > + continue; > > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > > + } > > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > > +} > > > > > > > + > > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > > { > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > > } > > > > > > > + > > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > > } > > > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > > + if (vi->rq[i].page_pool) > > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > > } > > > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > - else if (vi->big_packets) > > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > > + if (vi->rq[i].page_pool) { > > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > > + virt_to_head_page(buf), > > > > > > > + true); > > > > > > > + } else { > > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > > + } > > > > > > > + } else if (vi->big_packets) { > > > > > > > give_pages(&vi->rq[i], buf); > > > > > > > - else > > > > > > > + } else { > > > > > > > put_page(virt_to_head_page(buf)); > > > > > > > + } > > > > > > > } > > > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > > virtnet_free_queues(vi); > > > > > > > } > > > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > > +{ > > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > > + > > > > > > > + struct page_pool_params pp_params = { > > > > > > > + .order = 0, > > > > > > > + .pool_size = rq->vq->num_max, > > > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > > > + .dev = vdev->dev.parent, > > > > > > > + .offset = 0, > > > > > > > + }; > > > > > > > + > > > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", 
> > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > > + rq->page_pool = NULL; > > > > > > > + } > > > > > > > +} > > > > > > > + > > > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > > > * least one full packet? > > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > > + > > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > > + else > > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > > + "page pool only support mergeable mode\n"); > > > > > > > + > > > > > > > } > > > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > > -- > > > > > > > 2.31.1 > > > > > > > > > > >
On Thu, Jun 8, 2023 at 8:38 AM Jason Wang <jasowang@redhat.com> wrote: > > On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote: > > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > > > in the normal path. > > > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > > > gain is observed in XDP cpumap: > > > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > > > DMA map/unmap support. > > > > > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > > > > > > > > Why off by default? > > > > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > > > > The less modes we have the better... > > > > > > > > > > > > > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > > > > packet size is very small, it reduces the likelihood of skb > > > > > > coalescing. But such cases are rare. > > > > > > > > > > small packets are rare? These workloads are easy to create actually. > > > > > Pls try and include benchmark with small packet size. > > > > > > > > > > > > > Sure, Thanks! > > > > > > Before going ahead and posting v2 patch, I would like to hear more > > > advice for the cases of small packets. I have done more performance > > > benchmark with small packets since then. Here is a list of iperf > > > output, > > > > > > With PP and PP fragmenting: > > > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes > > > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 > > > 223 KBytes > > > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 > > > 324 KBytes > > > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 > > > 1.08 MBytes > > > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 > > > 744 KBytes > > > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes > > > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes > > > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes > > > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes > > > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes > > Note that virtio-net driver is lacking things like BQL and others, so > it might suffer from buffer bloat for TCP performance. 
Would you mind > to measure with e.g. using testpmd on the vhost to see the rx PPS? >

No problem. Before we proceed to measure with testpmd, could you please
take a look at the PPS measurements we obtained previously and see if
they are sufficient? Though we will only utilize page pool for XDP in v2.

netperf -H 192.168.124.197 -p 4444 -t UDP_STREAM -l 0 -- -m $((1))

with page pool:
1.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: enp8s0 655092.27 0.35 27508.77 0.03 0.00 0.00 0.00 0.00
2.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: enp8s0 654749.87 0.63 27494.42 0.05 0.00 0.00 0.00 0.00
3.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: enp8s0 654230.40 0.10 27472.57 0.01 0.00 0.00 0.00 0.00
4.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: enp8s0 656661.33 0.15 27574.65 0.01 0.00 0.00 0.00 0.00

without page pool:
1.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: enp8s0 646515.20 0.47 27148.60 0.04 0.00 0.00 0.00 0.00
2.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: enp8s0 653874.13 0.18 27457.61 0.02 0.00 0.00 0.00 0.00
3.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: enp8s0 647246.93 0.15 27179.32 0.01 0.00 0.00 0.00 0.00
4.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: enp8s0 650625.07 0.27 27321.18 0.02 0.00 0.00 0.00 0.00

(655092+654749+654230+656661)/(646515+653874+647246+650625) = 1.00864886500966031113
On average it gives around a 0.8% increase in PPS, and this figure can
be reproduced consistently.

> > > > > > Without PP: > > > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes > > > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes > > > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes > > > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes > > > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes > > > 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes > > > 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes > > > 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes > > > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes > > > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes > > > > > > > > The major factor contributing to the performance drop is the reduction > > > > of skb coalescing. Additionally, without the page pool, small packets > > > > can still benefit from the allocation of 8 continuous pages by > > > > breaking them down into smaller pieces. This effectively reduces the > > > > frequency of page allocation from the buddy system. For instance, the > > > > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > > > > the benefits of using a page pool are limited in such cases. > > I wonder if we can improve page pool in this case anyhow. > We would like to make the effort to enhance skb coalescing to be more friendly with page pool buffers. But that involves modifications to some core data structure of mm. > > In fact, > > > without page pool fragmenting enabled, it can even hinder performance > > > from this perspective. > > > > > > Upon further consideration, I tend to believe making page pool the > > > default option may not be appropriate. 
As you pointed out, we cannot > > > simply ignore the performance impact on small packets. Any comments on > > > this will be much appreciated. > > > > > > > > > Thanks, > > > Liang > > > > > > So, let's only use page pool for XDP then? > > +1 > > We can start from this. > > Thanks > > > > > > > > > > > > The usage of page pool for big mode is being evaluated now. Thanks! > > > > > > > > > > > > > > --- > > > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > > > --- a/drivers/net/virtio_net.c > > > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > > > module_param(gso, bool, 0444); > > > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > > > + > > > > > > > > /* FIXME: MTU in config. */ > > > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > > > #define GOOD_COPY_LEN 128 > > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > > > /* Chain pages by the private ptr. */ > > > > > > > > struct page *pages; > > > > > > > > > > > > > > > > + /* Page pool */ > > > > > > > > + struct page_pool *page_pool; > > > > > > > > + > > > > > > > > /* Average packet length for mergeable receive buffers. */ > > > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > > > return skb; > > > > > > > > } > > > > > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > > > +{ > > > > > > > > + if (rq->page_pool) > > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > > + else > > > > > > > > + put_page(page); > > > > > > > > +} > > > > > > > > + > > > > > > > > /* Called from bottom half context */ > > > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > > struct receive_queue *rq, > > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > > > if (page_to_free) > > > > > > > > - put_page(page_to_free); > > > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > > > > > return skb; > > > > > > > > } > > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > > > return ret; > > > > > > > > } > > > > > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > > > { > > > > > > > > struct skb_shared_info *shinfo; > > > > > > > > struct page *xdp_page; > > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > > > - put_page(xdp_page); > > > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > > > } > > > > > > > > } > > > > > > > > } > > > > > > > > @@ 
-903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > > > return NULL; > > > > > > > > > > > > > > > > - page = alloc_page(GFP_ATOMIC); > > > > > > > > + if (rq->page_pool) > > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > > + else > > > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > > > + > > > > > > > > if (!page) > > > > > > > > return NULL; > > > > > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > > * is sending packet larger than the MTU. > > > > > > > > */ > > > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > > > - put_page(p); > > > > > > > > + virtnet_put_page(rq, p); > > > > > > > > goto err_buf; > > > > > > > > } > > > > > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > > > page_address(p) + off, buflen); > > > > > > > > page_off += buflen; > > > > > > > > - put_page(p); > > > > > > > > + virtnet_put_page(rq, p); > > > > > > > > } > > > > > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > > > return page; > > > > > > > > err_buf: > > > > > > > > - __free_pages(page, 0); > > > > > > > > + if (rq->page_pool) > > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > > + else > > > > > > > > + __free_pages(page, 0); > > > > > > > > return NULL; > > > > > > > > } > > > > > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > > > } > > > > > > > > stats->bytes += len; > > > > > > > > page = virt_to_head_page(buf); > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > } > > > > > > > > } > > > > > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > > cur_frag_size = truesize; > > > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > > > dev->stats.rx_length_errors++; > > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > > return 0; > > > > > > > > > > > > > > > > err: > > > > > > > > - put_xdp_frags(xdp); > > > > > > > > + put_xdp_frags(xdp, rq); > > > > > > > > return -EINVAL; > > > > > > > > } > > > > > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > > > return NULL; > > > > > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > > + if (rq->page_pool) > > > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > > + else > > > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > > if (!xdp_page) > > > > > > > > return NULL; > > > > > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > > > > > - put_page(*page); > > > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > 
> > > > > > *page = xdp_page; > > > > > > > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > > > if (unlikely(!head_skb)) > > > > > > > > break; > > > > > > > > + if (rq->page_pool) > > > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > > > return head_skb; > > > > > > > > > > > > > > > > case XDP_TX: > > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > > break; > > > > > > > > } > > > > > > > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > > > > > err_xdp: > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > > > stats->xdp_drops++; > > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > > > curr_skb = head_skb; > > > > > > > > > > > > > > > > + if (rq->page_pool) > > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > > + > > > > > > > > if (unlikely(!curr_skb)) > > > > > > > > goto err_skb; > > > > > > > > while (--num_buf) { > > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > curr_skb = nskb; > > > > > > > > head_skb->truesize += nskb->truesize; > > > > > > > > num_skb_frags = 0; > > > > > > > > + if (rq->page_pool) > > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > > } > > > > > > > > if (curr_skb != head_skb) { > > > > > > > > head_skb->data_len += len; > > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > } > > > > > > > > offset = buf - page_address(page); > > > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > > > len, truesize); > > > > > > > > } else { > > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > return head_skb; > > > > > > > > > > > > > > > > err_skb: > > > > > > > > - put_page(page); > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > > > err_buf: > > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > > > * disabled GSO for XDP, it won't be a big issue. 
> > > > > > > > */ > > > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > > - return -ENOMEM; > > > > > > > > + if (rq->page_pool) { > > > > > > > > + struct page *page; > > > > > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > > - get_page(alloc_frag->page); > > > > > > > > - alloc_frag->offset += len + room; > > > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > > > - if (hole < len + room) { > > > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > > > - * the current buffer. > > > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > > > - */ > > > > > > > > - if (!headroom) > > > > > > > > - len += hole; > > > > > > > > - alloc_frag->offset += hole; > > > > > > > > - } > > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > > + if (unlikely(!page)) > > > > > > > > + return -ENOMEM; > > > > > > > > + buf = (char *)page_address(page); > > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > > + } else { > > > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > > + return -ENOMEM; > > > > > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > > + get_page(alloc_frag->page); > > > > > > > > + alloc_frag->offset += len + room; > > > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > > > + if (hole < len + room) { > > > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > > > + * the current buffer. > > > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > > > > + */ > > > > > > > > + if (!headroom) > > > > > > > > + len += hole; > > > > > > > > + alloc_frag->offset += hole; > > > > > > > > + } > > > > > > > > + } > > > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > > > if (err < 0) > > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > > > > > return err; > > > > > > > > } > > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > > > if (err < 0) > > > > > > > > return err; > > > > > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > > > + vi->rq[qp_index].page_pool); > > > > > > > > + else > > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > > > + NULL); > > > > > > > > + > > > > > > > > if (err < 0) > > > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > > > } > > > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > > > break; > > > > > > > > } > > > > > > > > } > > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > > > switch (sset) { > > > > > > > > case ETH_SS_STATS: > > > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > > > default: > > > > > > > > return -EOPNOTSUPP; > > > > > > > > } > > > > > > > > } > > > > > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > > > +{ > > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > > > + int i; > > > > > > > > + > > > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > > > + if (!vi->rq[i].page_pool) > > > > > > > > + continue; > > > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > > > + } > > > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > > > +} > > > > > > > > + > > > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > > > { > > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > > > } > > > > > > > > + > > > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > > > } > > > > > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > > > + if (vi->rq[i].page_pool) > > > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > > > } > > > > > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > > - else if (vi->big_packets) > > > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > > > + if (vi->rq[i].page_pool) { > > > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > > > + virt_to_head_page(buf), > > > > > > > > + true); > > > > > > > > + } else { > > > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > > > + } > > > > > > > > + } else if (vi->big_packets) { > > > > > > > > give_pages(&vi->rq[i], buf); > > > > > > > > - else > > > > > > > > + } else { > > > > > > > > put_page(virt_to_head_page(buf)); > > > > > > > > + } > > > > > > > > } > > > > > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > > > virtnet_free_queues(vi); > > > > > > > > } > > > > > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > > > +{ > > > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > > > + > > > > > > > > + struct page_pool_params pp_params = { > > > > > > > > + .order = 0, > > > > > > > > + .pool_size = rq->vq->num_max, > > > > > > > > + .nid = dev_to_node(vdev->dev.parent), > > > > > > > > + .dev = vdev->dev.parent, > > > > > > > > + .offset = 0, > > > > > > > > + }; > > > > > > > > + > > > > > > > 
> + rq->page_pool = page_pool_create(&pp_params); > > > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > > > + rq->page_pool = NULL; > > > > > > > > + } > > > > > > > > +} > > > > > > > > + > > > > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > > > > * least one full packet? > > > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > > > + > > > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > > > + else > > > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > > > + "page pool only support mergeable mode\n"); > > > > > > > > + > > > > > > > > } > > > > > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > > > -- > > > > > > > > 2.31.1 > > > > > > > > > > > > > > >
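To make the allocation argument in the message above concrete, the sketch below is a small, self-contained userspace model of the counting only, not driver code. It assumes a 1024-byte buffer per small packet and a 32 KB refill, standing in for the 8 contiguous pages that skb_page_frag_refill() can hand out slices of, and compares that with giving each buffer its own order-0 page, as a per-packet page_pool_dev_alloc_pages() call does. A page pool recycles those pages, so the comparison is about per-buffer granularity and truesize rather than raw calls into the buddy allocator.

/* Toy model of the amortization discussed above: "refills" counts 32 KB
 * frag-cache refills sliced into small buffers, "pages" counts one full
 * 4 KB page handed out per buffer. Illustrative arithmetic only.
 */
#include <stdio.h>

#define PAGE_SZ    4096
#define FRAG_CACHE (8 * PAGE_SZ)        /* 32 KB, i.e. 8 contiguous pages */

int main(void)
{
	unsigned int pkts = 32, buf = 1024;   /* assumed small-packet case */
	unsigned int refills, pages;

	/* One 32 KB refill is carved into FRAG_CACHE / buf buffers. */
	refills = (pkts * buf + FRAG_CACHE - 1) / FRAG_CACHE;
	/* One full page per buffer when every packet gets its own page. */
	pages = pkts;

	printf("%u packets x %u bytes: %u frag-cache refill(s) vs %u full page(s)\n",
	       pkts, buf, refills, pages);
	return 0;
}

For the 32 x 1 KB example used in the thread this prints 1 refill versus 32 pages, which is the amortization the non-page-pool path gets for free.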
On Fri, Jun 9, 2023 at 10:57 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > On Thu, Jun 8, 2023 at 8:38 AM Jason Wang <jasowang@redhat.com> wrote: > > > > On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote: > > > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > > > > in the normal path. > > > > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > > > > gain is observed in XDP cpumap: > > > > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > > > > DMA map/unmap support. > > > > > > > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > > > > > > > > > > Why off by default? > > > > > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > > > > > The less modes we have the better... > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > > > > > packet size is very small, it reduces the likelihood of skb > > > > > > > coalescing. But such cases are rare. > > > > > > > > > > > > small packets are rare? These workloads are easy to create actually. > > > > > > Pls try and include benchmark with small packet size. > > > > > > > > > > > > > > > > Sure, Thanks! > > > > > > > > Before going ahead and posting v2 patch, I would like to hear more > > > > advice for the cases of small packets. I have done more performance > > > > benchmark with small packets since then. 
Here is a list of iperf > > > > output, > > > > > > > > With PP and PP fragmenting: > > > > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes > > > > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 > > > > 223 KBytes > > > > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 > > > > 324 KBytes > > > > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 > > > > 1.08 MBytes > > > > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 > > > > 744 KBytes > > > > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes > > > > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes > > > > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes > > > > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes > > > > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes > > > > Note that virtio-net driver is lacking things like BQL and others, so > > it might suffer from buffer bloat for TCP performance. Would you mind > > to measure with e.g using testpmd on the vhost to see the rx PPS? > > > > No problem. Before we proceed to measure with testpmd, could you > please take a look at the PPS measurements we obtained previously and > see if they are sufficient? Though we will only utilize page pool for > xdp on v2. > > netperf -H 192.168.124.197 -p 4444 -t UDP_STREAM -l 0 -- -m $((1)) > > with page pool: > 1. > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > rxcmp/s txcmp/s rxmcst/s %ifutil > Average: enp8s0 655092.27 0.35 27508.77 0.03 > 0.00 0.00 0.00 0.00 > 2. > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > rxcmp/s txcmp/s rxmcst/s %ifutil > Average: enp8s0 654749.87 0.63 27494.42 0.05 > 0.00 0.00 0.00 0.00 > 3. > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > rxcmp/s txcmp/s rxmcst/s %ifutil > Average: enp8s0 654230.40 0.10 27472.57 0.01 > 0.00 0.00 0.00 0.00 > 4. > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > rxcmp/s txcmp/s rxmcst/s %ifutil > Average: enp8s0 656661.33 0.15 27574.65 0.01 > 0.00 0.00 0.00 0.00 > > > without page pool: > 1. > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > rxcmp/s txcmp/s rxmcst/s %ifutil > Average: enp8s0 646515.20 0.47 27148.60 0.04 > 0.00 0.00 0.00 0.00 > 2. > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > rxcmp/s txcmp/s rxmcst/s %ifutil > Average: enp8s0 653874.13 0.18 27457.61 0.02 > 0.00 0.00 0.00 0.00 > 3. > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > rxcmp/s txcmp/s rxmcst/s %ifutil > Average: enp8s0 647246.93 0.15 27179.32 0.01 > 0.00 0.00 0.00 0.00 > 4. > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > rxcmp/s txcmp/s rxmcst/s %ifutil > Average: enp8s0 650625.07 0.27 27321.18 0.02 > 0.00 0.00 0.00 0.00 > > > (655092+654749+654230+656661)/(646515+653874+647246+650625) = > 1.00864886500966031113 > On average it gives around 0.8% increase in PPS, and this figure can > be reproduced consistently. 
> > > > > > > > > Without PP: > > > > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes > > > > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes > > > > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes > > > > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes > > > > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes > > > > 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes > > > > 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes > > > > 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes > > > > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes > > > > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes > > > > > > > > > > > > The major factor contributing to the performance drop is the reduction > > > > of skb coalescing. Additionally, without the page pool, small packets > > > > can still benefit from the allocation of 8 continuous pages by > > > > breaking them down into smaller pieces. This effectively reduces the > > > > frequency of page allocation from the buddy system. For instance, the > > > > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > > > > the benefits of using a page pool are limited in such cases. > > > > I wonder if we can improve page pool in this case anyhow. > > > > We would like to make the effort to enhance skb coalecsing to be more > friendly with page pool buffers. But that involves modifications to > some core data structure of mm. > > Just to give an update on the status of v2 progress. We have identified the specific situation where skb coalescing is affected by page pool and small packets. In summary, it was because, in our environment (Fedora 37 default network setup), NetworkManager creates a SOCK_DGRAM socket, which eventually results in skbs being cloned, thus causing the failure of skb coalescing. While it affects small packet performance in our environment, it doesn't necessarily have the same impact for other users. Nevertheless, we are trying to make an optimization to allow skb coalescing in that situation. It may take some time as we are advised to wait for some relevant prior patches to land first. (https://patchwork.kernel.org/project/netdevbpf/patch/20230628121150.47778-1-liangchen.linux@gmail.com/) In addition, if small packet performance is not dropping anymore, perhaps we can enable page pool for the normal path as well. Thanks, Liang > > > In fact, > > > > without page pool fragmenting enabled, it can even hinder performance > > > > from this perspective. > > > > > > > > Upon further consideration, I tend to believe making page pool the > > > > default option may not be appropriate. As you pointed out, we cannot > > > > simply ignore the performance impact on small packets. Any comments on > > > > this will be much appreciated. > > > > > > > > > > > > Thanks, > > > > Liang > > > > > > > > > So, let's only use page pool for XDP then? > > > > +1 > > > > We can start from this. > > > > Thanks > > > > > > > > > > > > > > > > The usage of page pool for big mode is being evaluated now. Thanks! 
> > > > > > > > > > > > > > > > --- > > > > > > > > > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > > > > > > > > > 1 file changed, 146 insertions(+), 42 deletions(-) > > > > > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644 > > > > > > > > > --- a/drivers/net/virtio_net.c > > > > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > > > > > > > > > module_param(gso, bool, 0444); > > > > > > > > > module_param(napi_tx, bool, 0644); > > > > > > > > > > > > > > > > > > +static bool page_pool_enabled; > > > > > > > > > +module_param(page_pool_enabled, bool, 0400); > > > > > > > > > + > > > > > > > > > /* FIXME: MTU in config. */ > > > > > > > > > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > > > > > > > > > #define GOOD_COPY_LEN 128 > > > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue { > > > > > > > > > /* Chain pages by the private ptr. */ > > > > > > > > > struct page *pages; > > > > > > > > > > > > > > > > > > + /* Page pool */ > > > > > > > > > + struct page_pool *page_pool; > > > > > > > > > + > > > > > > > > > /* Average packet length for mergeable receive buffers. */ > > > > > > > > > struct ewma_pkt_len mrg_avg_pkt_len; > > > > > > > > > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > > > > > > > > > return skb; > > > > > > > > > } > > > > > > > > > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > > > > > > > > > +{ > > > > > > > > > + if (rq->page_pool) > > > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > > > + else > > > > > > > > > + put_page(page); > > > > > > > > > +} > > > > > > > > > + > > > > > > > > > /* Called from bottom half context */ > > > > > > > > > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > > > struct receive_queue *rq, > > > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > > > > > > > > > hdr = skb_vnet_hdr(skb); > > > > > > > > > memcpy(hdr, hdr_p, hdr_len); > > > > > > > > > if (page_to_free) > > > > > > > > > - put_page(page_to_free); > > > > > > > > > + virtnet_put_page(rq, page_to_free); > > > > > > > > > > > > > > > > > > return skb; > > > > > > > > > } > > > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > > > > > > > > return ret; > > > > > > > > > } > > > > > > > > > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > > > > > > > > > { > > > > > > > > > struct skb_shared_info *shinfo; > > > > > > > > > struct page *xdp_page; > > > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > > > > > > > > > shinfo = xdp_get_shared_info_from_buff(xdp); > > > > > > > > > for (i = 0; i < shinfo->nr_frags; i++) { > > > > > > > > > xdp_page = skb_frag_page(&shinfo->frags[i]); > > > > > > > > > - put_page(xdp_page); > > > > > > > > > + virtnet_put_page(rq, xdp_page); > > > > > > > > > } > > > > > > > > > } > > > > > > > > > } > > > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > > > if (page_off + *len + tailroom > PAGE_SIZE) > > > > > > > > > return NULL; > > > > > > > > > > > > > > > > > > - page = 
alloc_page(GFP_ATOMIC); > > > > > > > > > + if (rq->page_pool) > > > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > > > + else > > > > > > > > > + page = alloc_page(GFP_ATOMIC); > > > > > > > > > + > > > > > > > > > if (!page) > > > > > > > > > return NULL; > > > > > > > > > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > > > > > > > > > * is sending packet larger than the MTU. > > > > > > > > > */ > > > > > > > > > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > > > > > > > > > - put_page(p); > > > > > > > > > + virtnet_put_page(rq, p); > > > > > > > > > goto err_buf; > > > > > > > > > } > > > > > > > > > > > > > > > > > > memcpy(page_address(page) + page_off, > > > > > > > > > page_address(p) + off, buflen); > > > > > > > > > page_off += buflen; > > > > > > > > > - put_page(p); > > > > > > > > > + virtnet_put_page(rq, p); > > > > > > > > > } > > > > > > > > > > > > > > > > > > /* Headroom does not contribute to packet length */ > > > > > > > > > *len = page_off - VIRTIO_XDP_HEADROOM; > > > > > > > > > return page; > > > > > > > > > err_buf: > > > > > > > > > - __free_pages(page, 0); > > > > > > > > > + if (rq->page_pool) > > > > > > > > > + page_pool_put_full_page(rq->page_pool, page, true); > > > > > > > > > + else > > > > > > > > > + __free_pages(page, 0); > > > > > > > > > return NULL; > > > > > > > > > } > > > > > > > > > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > > > > > > > > > } > > > > > > > > > stats->bytes += len; > > > > > > > > > page = virt_to_head_page(buf); > > > > > > > > > - put_page(page); > > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > > } > > > > > > > > > } > > > > > > > > > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > > > cur_frag_size = truesize; > > > > > > > > > xdp_frags_truesz += cur_frag_size; > > > > > > > > > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > > > > > > > > > - put_page(page); > > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > > > > > > > > > dev->name, len, (unsigned long)(truesize - room)); > > > > > > > > > dev->stats.rx_length_errors++; > > > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > > > > > > > > > return 0; > > > > > > > > > > > > > > > > > > err: > > > > > > > > > - put_xdp_frags(xdp); > > > > > > > > > + put_xdp_frags(xdp, rq); > > > > > > > > > return -EINVAL; > > > > > > > > > } > > > > > > > > > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > if (*len + xdp_room > PAGE_SIZE) > > > > > > > > > return NULL; > > > > > > > > > > > > > > > > > > - xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > > > + if (rq->page_pool) > > > > > > > > > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > > > + else > > > > > > > > > + xdp_page = alloc_page(GFP_ATOMIC); > > > > > > > > > if (!xdp_page) > > > > > > > > > return NULL; > > > > > > > > > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > > > > > > > > > > > > > > > > > *frame_sz = PAGE_SIZE; > > > > > > > > > > > > > > > > > > - put_page(*page); > > > > > > > > > + virtnet_put_page(rq, *page); > > > > > > > > > > > > > > > > > > *page = xdp_page; > > > > > > > > 
> > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > > > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > > > > > > > > > if (unlikely(!head_skb)) > > > > > > > > > break; > > > > > > > > > + if (rq->page_pool) > > > > > > > > > + skb_mark_for_recycle(head_skb); > > > > > > > > > return head_skb; > > > > > > > > > > > > > > > > > > case XDP_TX: > > > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > > > > > > > > > break; > > > > > > > > > } > > > > > > > > > > > > > > > > > > - put_xdp_frags(&xdp); > > > > > > > > > + put_xdp_frags(&xdp, rq); > > > > > > > > > > > > > > > > > > err_xdp: > > > > > > > > > - put_page(page); > > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > > > > > stats->xdp_drops++; > > > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > > > > > > > > > curr_skb = head_skb; > > > > > > > > > > > > > > > > > > + if (rq->page_pool) > > > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > > > + > > > > > > > > > if (unlikely(!curr_skb)) > > > > > > > > > goto err_skb; > > > > > > > > > while (--num_buf) { > > > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > > curr_skb = nskb; > > > > > > > > > head_skb->truesize += nskb->truesize; > > > > > > > > > num_skb_frags = 0; > > > > > > > > > + if (rq->page_pool) > > > > > > > > > + skb_mark_for_recycle(curr_skb); > > > > > > > > > } > > > > > > > > > if (curr_skb != head_skb) { > > > > > > > > > head_skb->data_len += len; > > > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > > } > > > > > > > > > offset = buf - page_address(page); > > > > > > > > > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > > > > > > > > > - put_page(page); > > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > > > > > > > > > len, truesize); > > > > > > > > > } else { > > > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > > > > > > > > > return head_skb; > > > > > > > > > > > > > > > > > > err_skb: > > > > > > > > > - put_page(page); > > > > > > > > > + virtnet_put_page(rq, page); > > > > > > > > > mergeable_buf_free(rq, num_buf, dev, stats); > > > > > > > > > > > > > > > > > > err_buf: > > > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > > > > > > > > > * disabled GSO for XDP, it won't be a big issue. 
> > > > > > > > > */ > > > > > > > > > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > > > > > > > > > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > > > - return -ENOMEM; > > > > > > > > > + if (rq->page_pool) { > > > > > > > > > + struct page *page; > > > > > > > > > > > > > > > > > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > > > - buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > > > - get_page(alloc_frag->page); > > > > > > > > > - alloc_frag->offset += len + room; > > > > > > > > > - hole = alloc_frag->size - alloc_frag->offset; > > > > > > > > > - if (hole < len + room) { > > > > > > > > > - /* To avoid internal fragmentation, if there is very likely not > > > > > > > > > - * enough space for another buffer, add the remaining space to > > > > > > > > > - * the current buffer. > > > > > > > > > - * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > > > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > > > > > > > > > - */ > > > > > > > > > - if (!headroom) > > > > > > > > > - len += hole; > > > > > > > > > - alloc_frag->offset += hole; > > > > > > > > > - } > > > > > > > > > + page = page_pool_dev_alloc_pages(rq->page_pool); > > > > > > > > > + if (unlikely(!page)) > > > > > > > > > + return -ENOMEM; > > > > > > > > > + buf = (char *)page_address(page); > > > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > > > + } else { > > > > > > > > > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > > > > > > > > > + return -ENOMEM; > > > > > > > > > > > > > > > > > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > > > > > > > > > + buf += headroom; /* advance address leaving hole at front of pkt */ > > > > > > > > > + get_page(alloc_frag->page); > > > > > > > > > + alloc_frag->offset += len + room; > > > > > > > > > + hole = alloc_frag->size - alloc_frag->offset; > > > > > > > > > + if (hole < len + room) { > > > > > > > > > + /* To avoid internal fragmentation, if there is very likely not > > > > > > > > > + * enough space for another buffer, add the remaining space to > > > > > > > > > + * the current buffer. > > > > > > > > > + * XDP core assumes that frame_size of xdp_buff and the length > > > > > > > > > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. 
> > > > > > > > > + */ > > > > > > > > > + if (!headroom) > > > > > > > > > + len += hole; > > > > > > > > > + alloc_frag->offset += hole; > > > > > > > > > + } > > > > > > > > > + } > > > > > > > > > sg_init_one(rq->sg, buf, len); > > > > > > > > > ctx = mergeable_len_to_ctx(len + room, headroom); > > > > > > > > > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > > > > > > > > > if (err < 0) > > > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > > > + virtnet_put_page(rq, virt_to_head_page(buf)); > > > > > > > > > > > > > > > > > > return err; > > > > > > > > > } > > > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > > > > > > > > if (err < 0) > > > > > > > > > return err; > > > > > > > > > > > > > > > > > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > > > - MEM_TYPE_PAGE_SHARED, NULL); > > > > > > > > > + if (vi->rq[qp_index].page_pool) > > > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > > > + MEM_TYPE_PAGE_POOL, > > > > > > > > > + vi->rq[qp_index].page_pool); > > > > > > > > > + else > > > > > > > > > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > > > > > > > > > + MEM_TYPE_PAGE_SHARED, > > > > > > > > > + NULL); > > > > > > > > > + > > > > > > > > > if (err < 0) > > > > > > > > > goto err_xdp_reg_mem_model; > > > > > > > > > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > > > > > > > > > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > > > > > > > > > virtnet_sq_stats_desc[j].desc); > > > > > > > > > } > > > > > > > > > + page_pool_ethtool_stats_get_strings(p); > > > > > > > > > break; > > > > > > > > > } > > > > > > > > > } > > > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > > > > > > > > > switch (sset) { > > > > > > > > > case ETH_SS_STATS: > > > > > > > > > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > > > > > > > > > - VIRTNET_SQ_STATS_LEN); > > > > > > > > > + VIRTNET_SQ_STATS_LEN + > > > > > > > > > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> > > > > > > > > + page_pool_ethtool_stats_get_count() : 0)); > > > > > > > > > default: > > > > > > > > > return -EOPNOTSUPP; > > > > > > > > > } > > > > > > > > > } > > > > > > > > > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > > > > > > > > > +{ > > > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS > > > > > > > > > + struct virtnet_info *vi = netdev_priv(dev); > > > > > > > > > + struct page_pool_stats pp_stats = {}; > > > > > > > > > + int i; > > > > > > > > > + > > > > > > > > > + for (i = 0; i < vi->curr_queue_pairs; i++) { > > > > > > > > > + if (!vi->rq[i].page_pool) > > > > > > > > > + continue; > > > > > > > > > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > > > > > > > > > + } > > > > > > > > > + page_pool_ethtool_stats_get(data, &pp_stats); > > > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */ > > > > > > > > > +} > > > > > > > > > + > > > > > > > > > static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > > > struct ethtool_stats *stats, u64 *data) > > > > > > > > > { > > > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > > > > > > > > > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > > > > > > > > > idx += VIRTNET_SQ_STATS_LEN; > > > > > > > > > } > > > > > > > > > + > > > > > > > > > + virtnet_get_page_pool_stats(dev, &data[idx]); > > > > > > > > > } > > > > > > > > > > > > > > > > > > static void virtnet_get_channels(struct net_device *dev, > > > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > > > > > > > > > for (i = 0; i < vi->max_queue_pairs; i++) { > > > > > > > > > __netif_napi_del(&vi->rq[i].napi); > > > > > > > > > __netif_napi_del(&vi->sq[i].napi); > > > > > > > > > + if (vi->rq[i].page_pool) > > > > > > > > > + page_pool_destroy(vi->rq[i].page_pool); > > > > > > > > > } > > > > > > > > > > > > > > > > > > /* We called __netif_napi_del(), > > > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > > > > > > > > > struct virtnet_info *vi = vq->vdev->priv; > > > > > > > > > int i = vq2rxq(vq); > > > > > > > > > > > > > > > > > > - if (vi->mergeable_rx_bufs) > > > > > > > > > - put_page(virt_to_head_page(buf)); > > > > > > > > > - else if (vi->big_packets) > > > > > > > > > + if (vi->mergeable_rx_bufs) { > > > > > > > > > + if (vi->rq[i].page_pool) { > > > > > > > > > + page_pool_put_full_page(vi->rq[i].page_pool, > > > > > > > > > + virt_to_head_page(buf), > > > > > > > > > + true); > > > > > > > > > + } else { > > > > > > > > > + put_page(virt_to_head_page(buf)); > > > > > > > > > + } > > > > > > > > > + } else if (vi->big_packets) { > > > > > > > > > give_pages(&vi->rq[i], buf); > > > > > > > > > - else > > > > > > > > > + } else { > > > > > > > > > put_page(virt_to_head_page(buf)); > > > > > > > > > + } > > > > > > > > > } > > > > > > > > > > > > > > > > > > static void free_unused_bufs(struct virtnet_info *vi) > > > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > > > > > > > > > virtnet_free_queues(vi); > > > > > > > > > } > > > > > > > > > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > > > > > > > > > +{ > > > > > > > > > + struct virtio_device *vdev = rq->vq->vdev; > > > > > > > > > + > > > > > > > > > + struct page_pool_params pp_params = { > > > > > > > > > + .order = 0, > > > > > > > > > + .pool_size = rq->vq->num_max, > > > > > > > > > + .nid = 
dev_to_node(vdev->dev.parent), > > > > > > > > > + .dev = vdev->dev.parent, > > > > > > > > > + .offset = 0, > > > > > > > > > + }; > > > > > > > > > + > > > > > > > > > + rq->page_pool = page_pool_create(&pp_params); > > > > > > > > > + if (IS_ERR(rq->page_pool)) { > > > > > > > > > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > > > > > > > > > + PTR_ERR(rq->page_pool)); > > > > > > > > > + rq->page_pool = NULL; > > > > > > > > > + } > > > > > > > > > +} > > > > > > > > > + > > > > > > > > > /* How large should a single buffer be so a queue full of these can fit at > > > > > > > > > * least one full packet? > > > > > > > > > * Logic below assumes the mergeable buffer header is used. > > > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > > > > > > vi->rq[i].vq = vqs[rxq2vq(i)]; > > > > > > > > > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > > > > > > > > > vi->sq[i].vq = vqs[txq2vq(i)]; > > > > > > > > > + > > > > > > > > > + if (page_pool_enabled && vi->mergeable_rx_bufs) > > > > > > > > > + virtnet_alloc_page_pool(&vi->rq[i]); > > > > > > > > > + else > > > > > > > > > + dev_warn(&vi->vdev->dev, > > > > > > > > > + "page pool only support mergeable mode\n"); > > > > > > > > > + > > > > > > > > > } > > > > > > > > > > > > > > > > > > /* run here: ret == 0. */ > > > > > > > > > -- > > > > > > > > > 2.31.1 > > > > > > > > > > > > > > > > > > >
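For reference, the per-queue wiring the patch adds condenses to roughly the sketch below: one page_pool is created per receive queue (the virtnet_alloc_page_pool() hunk quoted just above) and later registered as that queue's XDP memory model in virtnet_enable_queue_pair(). The helper name and the combined error handling are illustrative only; they are not part of the patch itself.

#include <net/page_pool.h>
#include <net/xdp.h>

/* Illustrative condensation of the quoted hunks: one pool per RX queue,
 * sized to the virtqueue, then handed to the XDP core so redirected
 * buffers are returned to the pool instead of freed.
 */
static int virtnet_rq_page_pool_setup(struct receive_queue *rq,
				      struct device *parent)
{
	struct page_pool_params pp_params = {
		.order		= 0,			/* one page per buffer */
		.pool_size	= rq->vq->num_max,	/* match the vq size */
		.nid		= dev_to_node(parent),
		.dev		= parent,
		.offset		= 0,
	};

	rq->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(rq->page_pool)) {
		int err = PTR_ERR(rq->page_pool);

		rq->page_pool = NULL;	/* fall back to alloc_page()/put_page() */
		return err;
	}

	return xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, MEM_TYPE_PAGE_POOL,
					  rq->page_pool);
}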
On Wed, Jul 5, 2023 at 1:41 PM Liang Chen <liangchen.linux@gmail.com> wrote: > > On Fri, Jun 9, 2023 at 10:57 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > On Thu, Jun 8, 2023 at 8:38 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote: > > > > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > > > > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > > > > > in the normal path. > > > > > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > > > > > gain is observed in XDP cpumap: > > > > > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > > > > > DMA map/unmap support. > > > > > > > > > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > > > > > > > > > > > > Why off by default? > > > > > > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > > > > > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > > > > > > The less modes we have the better... > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > > > > > > packet size is very small, it reduces the likelihood of skb > > > > > > > > coalescing. But such cases are rare. > > > > > > > > > > > > > > small packets are rare? These workloads are easy to create actually. > > > > > > > Pls try and include benchmark with small packet size. > > > > > > > > > > > > > > > > > > > Sure, Thanks! > > > > > > > > > > Before going ahead and posting v2 patch, I would like to hear more > > > > > advice for the cases of small packets. I have done more performance > > > > > benchmark with small packets since then. 
Here is a list of iperf > > > > > output, > > > > > > > > > > With PP and PP fragmenting: > > > > > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes > > > > > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 > > > > > 223 KBytes > > > > > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 > > > > > 324 KBytes > > > > > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 > > > > > 1.08 MBytes > > > > > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 > > > > > 744 KBytes > > > > > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes > > > > > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes > > > > > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes > > > > > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes > > > > > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes > > > > > > Note that virtio-net driver is lacking things like BQL and others, so > > > it might suffer from buffer bloat for TCP performance. Would you mind > > > to measure with e.g using testpmd on the vhost to see the rx PPS? > > > > > > > No problem. Before we proceed to measure with testpmd, could you > > please take a look at the PPS measurements we obtained previously and > > see if they are sufficient? Though we will only utilize page pool for > > xdp on v2. > > > > netperf -H 192.168.124.197 -p 4444 -t UDP_STREAM -l 0 -- -m $((1)) > > > > with page pool: > > 1. > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > rxcmp/s txcmp/s rxmcst/s %ifutil > > Average: enp8s0 655092.27 0.35 27508.77 0.03 > > 0.00 0.00 0.00 0.00 > > 2. > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > rxcmp/s txcmp/s rxmcst/s %ifutil > > Average: enp8s0 654749.87 0.63 27494.42 0.05 > > 0.00 0.00 0.00 0.00 > > 3. > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > rxcmp/s txcmp/s rxmcst/s %ifutil > > Average: enp8s0 654230.40 0.10 27472.57 0.01 > > 0.00 0.00 0.00 0.00 > > 4. > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > rxcmp/s txcmp/s rxmcst/s %ifutil > > Average: enp8s0 656661.33 0.15 27574.65 0.01 > > 0.00 0.00 0.00 0.00 > > > > > > without page pool: > > 1. > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > rxcmp/s txcmp/s rxmcst/s %ifutil > > Average: enp8s0 646515.20 0.47 27148.60 0.04 > > 0.00 0.00 0.00 0.00 > > 2. > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > rxcmp/s txcmp/s rxmcst/s %ifutil > > Average: enp8s0 653874.13 0.18 27457.61 0.02 > > 0.00 0.00 0.00 0.00 > > 3. > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > rxcmp/s txcmp/s rxmcst/s %ifutil > > Average: enp8s0 647246.93 0.15 27179.32 0.01 > > 0.00 0.00 0.00 0.00 > > 4. > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > rxcmp/s txcmp/s rxmcst/s %ifutil > > Average: enp8s0 650625.07 0.27 27321.18 0.02 > > 0.00 0.00 0.00 0.00 > > > > > > (655092+654749+654230+656661)/(646515+653874+647246+650625) = > > 1.00864886500966031113 > > On average it gives around 0.8% increase in PPS, and this figure can > > be reproduced consistently. 
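As a cross-check of the "around 0.8%" figure, the gain is simply the ratio of the summed rxpck/s samples quoted above; a throwaway user-space snippet (numbers copied verbatim from the sar output, nothing else assumed):

#include <stdio.h>

int main(void)
{
	/* rxpck/s with and without the page pool, from the sar runs above */
	const double with_pp[]    = { 655092.27, 654749.87, 654230.40, 656661.33 };
	const double without_pp[] = { 646515.20, 653874.13, 647246.93, 650625.07 };
	double sum_pp = 0.0, sum_base = 0.0;

	for (int i = 0; i < 4; i++) {
		sum_pp   += with_pp[i];
		sum_base += without_pp[i];
	}
	/* prints ~0.86%, i.e. the "around 0.8%" quoted above */
	printf("PPS gain: %.2f%%\n", (sum_pp / sum_base - 1.0) * 100.0);
	return 0;
}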
> > > > > > > > > > > > Without PP: > > > > > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes > > > > > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes > > > > > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes > > > > > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes > > > > > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes > > > > > 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes > > > > > 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes > > > > > 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes > > > > > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes > > > > > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes > > > > > > > > > > > > > > > The major factor contributing to the performance drop is the reduction > > > > > of skb coalescing. Additionally, without the page pool, small packets > > > > > can still benefit from the allocation of 8 continuous pages by > > > > > breaking them down into smaller pieces. This effectively reduces the > > > > > frequency of page allocation from the buddy system. For instance, the > > > > > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > > > > > the benefits of using a page pool are limited in such cases. > > > > > > I wonder if we can improve page pool in this case anyhow. > > > > > > > We would like to make the effort to enhance skb coalecsing to be more > > friendly with page pool buffers. But that involves modifications to > > some core data structure of mm. > > > > > > Just to give an update on the status of v2 progress. We have > identified the specific situation where skb coalescing is affected by > page pool and small packets. In summary, it was because, in our > environment (Fedora 37 default network setup), NetworkManager creates > a SOCK_DGRAM socket, which eventually results in skbs being cloned, > thus causing the failure of skb coalescing. > > While it affects small packet performance in our environment, it > doesn't necessarily have the same impact for other users. > Nevertheless, we are trying to make an optimization to allow skb > coalescing in that situation. It may take some time as we are advised > to wait for some relevant prior patches to land first. > (https://patchwork.kernel.org/project/netdevbpf/patch/20230628121150.47778-1-liangchen.linux@gmail.com/) If it's not too late, I would like to be copied in the next version of this. (Since it seems to be suggested by me). > > In addition, if small packet performance is not dropping anymore, > perhaps we can enable page pool for the normal path as well. It would be simpler if we start from XDP and normal on top. But it's your call anyway. Thanks > > > Thanks, > Liang > > > > > > > In fact, > > > > > without page pool fragmenting enabled, it can even hinder performance > > > > > from this perspective. > > > > > > > > > > Upon further consideration, I tend to believe making page pool the > > > > > default option may not be appropriate. As you pointed out, we cannot > > > > > simply ignore the performance impact on small packets. Any comments on > > > > > this will be much appreciated. > > > > > > > > > > > > > > > Thanks, > > > > > Liang > > > > > > > > > > > > So, let's only use page pool for XDP then? > > > > > > +1 > > > > > > We can start from this. > > > > > > Thanks > > > > > > > > > > > > > > > > > > > > The usage of page pool for big mode is being evaluated now. Thanks! 
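For readers following the skb coalescing point above: the gate the discussion refers to sits at the top of skb_try_coalesce() in net/core/skbuff.c. The sketch below is a paraphrase, not a verbatim copy, and the helper name is made up for illustration: coalescing is refused when the destination skb is cloned, and also when page_pool ("pp_recycle") pages are involved and the source skb has been cloned, which is the kind of clone a SOCK_DGRAM tap such as the one NetworkManager opens in the reported setup ends up producing.

#include <linux/skbuff.h>

/* Rough paraphrase of the checks at the start of skb_try_coalesce(). */
static bool coalesce_allowed(struct sk_buff *to, struct sk_buff *from)
{
	if (skb_cloned(to))
		return false;
	/* Avoid mixing page_pool and non-page_pool pages in one skb, and
	 * avoid cloned skbs that carry page_pool pages, since fragment
	 * reference counts would become inconsistent.
	 */
	if (to->pp_recycle != from->pp_recycle)
		return false;
	if (from->pp_recycle && skb_cloned(from))
		return false;
	return true;
}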
On Wed, Jul 5, 2023 at 2:05 PM Jason Wang <jasowang@redhat.com> wrote: > > On Wed, Jul 5, 2023 at 1:41 PM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > On Fri, Jun 9, 2023 at 10:57 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > On Thu, Jun 8, 2023 at 8:38 AM Jason Wang <jasowang@redhat.com> wrote: > > > > > > > > On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote: > > > > > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote: > > > > > > > > > > > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > > > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > > > > > > > The implementation at the moment uses one page per packet in both the > > > > > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable > > > > > > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain > > > > > > > > > > > in the normal path. > > > > > > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > > > > > > > > > > > In multi-core vm testing environments, The most significant performance > > > > > > > > > > > gain is observed in XDP cpumap: > > > > > > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and > > > > > > > > > > > DMA map/unmap support. > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > > > > > > > > > > > > > > > > > > > > Why off by default? > > > > > > > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > > > > > > > The less modes we have the better... > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > > > > > > > packet size is very small, it reduces the likelihood of skb > > > > > > > > > coalescing. But such cases are rare. > > > > > > > > > > > > > > > > small packets are rare? These workloads are easy to create actually. > > > > > > > > Pls try and include benchmark with small packet size. > > > > > > > > > > > > > > > > > > > > > > Sure, Thanks! > > > > > > > > > > > > Before going ahead and posting v2 patch, I would like to hear more > > > > > > advice for the cases of small packets. I have done more performance > > > > > > benchmark with small packets since then. 
Here is a list of iperf > > > > > > output, > > > > > > > > > > > > With PP and PP fragmenting: > > > > > > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec 0 144 KBytes > > > > > > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec 0 > > > > > > 223 KBytes > > > > > > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec 0 > > > > > > 324 KBytes > > > > > > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec 0 > > > > > > 1.08 MBytes > > > > > > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec 0 > > > > > > 744 KBytes > > > > > > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec 0 963 KBytes > > > > > > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec 0 1.25 MBytes > > > > > > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec 0 1.70 MBytes > > > > > > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec 0 4.26 MBytes > > > > > > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec 0 3.20 MBytes > > > > > > > > Note that virtio-net driver is lacking things like BQL and others, so > > > > it might suffer from buffer bloat for TCP performance. Would you mind > > > > to measure with e.g using testpmd on the vhost to see the rx PPS? > > > > > > > > > > No problem. Before we proceed to measure with testpmd, could you > > > please take a look at the PPS measurements we obtained previously and > > > see if they are sufficient? Though we will only utilize page pool for > > > xdp on v2. > > > > > > netperf -H 192.168.124.197 -p 4444 -t UDP_STREAM -l 0 -- -m $((1)) > > > > > > with page pool: > > > 1. > > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > > rxcmp/s txcmp/s rxmcst/s %ifutil > > > Average: enp8s0 655092.27 0.35 27508.77 0.03 > > > 0.00 0.00 0.00 0.00 > > > 2. > > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > > rxcmp/s txcmp/s rxmcst/s %ifutil > > > Average: enp8s0 654749.87 0.63 27494.42 0.05 > > > 0.00 0.00 0.00 0.00 > > > 3. > > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > > rxcmp/s txcmp/s rxmcst/s %ifutil > > > Average: enp8s0 654230.40 0.10 27472.57 0.01 > > > 0.00 0.00 0.00 0.00 > > > 4. > > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > > rxcmp/s txcmp/s rxmcst/s %ifutil > > > Average: enp8s0 656661.33 0.15 27574.65 0.01 > > > 0.00 0.00 0.00 0.00 > > > > > > > > > without page pool: > > > 1. > > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > > rxcmp/s txcmp/s rxmcst/s %ifutil > > > Average: enp8s0 646515.20 0.47 27148.60 0.04 > > > 0.00 0.00 0.00 0.00 > > > 2. > > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > > rxcmp/s txcmp/s rxmcst/s %ifutil > > > Average: enp8s0 653874.13 0.18 27457.61 0.02 > > > 0.00 0.00 0.00 0.00 > > > 3. > > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > > rxcmp/s txcmp/s rxmcst/s %ifutil > > > Average: enp8s0 647246.93 0.15 27179.32 0.01 > > > 0.00 0.00 0.00 0.00 > > > 4. > > > Average: IFACE rxpck/s txpck/s rxkB/s txkB/s > > > rxcmp/s txcmp/s rxmcst/s %ifutil > > > Average: enp8s0 650625.07 0.27 27321.18 0.02 > > > 0.00 0.00 0.00 0.00 > > > > > > > > > (655092+654749+654230+656661)/(646515+653874+647246+650625) = > > > 1.00864886500966031113 > > > On average it gives around 0.8% increase in PPS, and this figure can > > > be reproduced consistently. 
> > > > > > > > > > > > > > > Without PP: > > > > > > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec 0 359 KBytes > > > > > > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec 0 730 KBytes > > > > > > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec 0 1.99 MBytes > > > > > > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec 0 1.20 MBytes > > > > > > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec 0 1.72 MBytes > > > > > > 16K: [ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec 0 2.90 MBytes > > > > > > 32K: [ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec 0 3.03 MBytes > > > > > > 64K: [ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec 0 3.05 MBytes > > > > > > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec 1 3.03 MBytes > > > > > > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec 0 3.11 MBytes > > > > > > > > > > > > > > > > > > The major factor contributing to the performance drop is the reduction > > > > > > of skb coalescing. Additionally, without the page pool, small packets > > > > > > can still benefit from the allocation of 8 continuous pages by > > > > > > breaking them down into smaller pieces. This effectively reduces the > > > > > > frequency of page allocation from the buddy system. For instance, the > > > > > > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > > > > > > the benefits of using a page pool are limited in such cases. > > > > > > > > I wonder if we can improve page pool in this case anyhow. > > > > > > > > > > We would like to make the effort to enhance skb coalecsing to be more > > > friendly with page pool buffers. But that involves modifications to > > > some core data structure of mm. > > > > > > > > > > Just to give an update on the status of v2 progress. We have > > identified the specific situation where skb coalescing is affected by > > page pool and small packets. In summary, it was because, in our > > environment (Fedora 37 default network setup), NetworkManager creates > > a SOCK_DGRAM socket, which eventually results in skbs being cloned, > > thus causing the failure of skb coalescing. > > > > While it affects small packet performance in our environment, it > > doesn't necessarily have the same impact for other users. > > Nevertheless, we are trying to make an optimization to allow skb > > coalescing in that situation. It may take some time as we are advised > > to wait for some relevant prior patches to land first. > > (https://patchwork.kernel.org/project/netdevbpf/patch/20230628121150.47778-1-liangchen.linux@gmail.com/) > > If it's not too late, I would like to be copied in the next version of > this. (Since it seems to be suggested by me). > Sure, no problem. It indeed was suggested by you. Thanks, Liang > > > > In addition, if small packet performance is not dropping anymore, > > perhaps we can enable page pool for the normal path as well. > > It would be simpler if we start from XDP and normal on top. But it's > your call anyway. > > Thanks > > > > > > > Thanks, > > Liang > > > > > > > > > > > In fact, > > > > > > without page pool fragmenting enabled, it can even hinder performance > > > > > > from this perspective. > > > > > > > > > > > > Upon further consideration, I tend to believe making page pool the > > > > > > default option may not be appropriate. As you pointed out, we cannot > > > > > > simply ignore the performance impact on small packets. Any comments on > > > > > > this will be much appreciated. 
在 2023/5/26 13:46, Liang Chen 写道: > The implementation at the moment uses one page per packet in both the > normal and XDP path. In addition, introducing a module parameter to enable > or disable the usage of page pool (disabled by default). > > In single-core vm testing environments, it gives a modest performance gain > in the normal path. > Upstream codebase: 47.5 Gbits/sec > Upstream codebase + page_pool support: 50.2 Gbits/sec > > In multi-core vm testing environments, The most significant performance > gain is observed in XDP cpumap: > Upstream codebase: 1.38 Gbits/sec > Upstream codebase + page_pool support: 9.74 Gbits/sec > > With this foundation, we can further integrate page pool fragmentation and > DMA map/unmap support. > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > --- > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > 1 file changed, 146 insertions(+), 42 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index c5dca0d92e64..99c0ca0c1781 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > module_param(gso, bool, 0444); > module_param(napi_tx, bool, 0644); > > +static bool page_pool_enabled; > +module_param(page_pool_enabled, bool, 0400); > + > /* FIXME: MTU in config. */ > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > #define GOOD_COPY_LEN 128 > @@ -159,6 +162,9 @@ struct receive_queue { > /* Chain pages by the private ptr. */ > struct page *pages; > > + /* Page pool */ > + struct page_pool *page_pool; > + > /* Average packet length for mergeable receive buffers. */ > struct ewma_pkt_len mrg_avg_pkt_len; > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > return skb; > } > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > +{ > + if (rq->page_pool) > + page_pool_put_full_page(rq->page_pool, page, true); > + else > + put_page(page); > +} > + > /* Called from bottom half context */ > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > struct receive_queue *rq, > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > hdr = skb_vnet_hdr(skb); > memcpy(hdr, hdr_p, hdr_len); > if (page_to_free) > - put_page(page_to_free); > + virtnet_put_page(rq, page_to_free); > > return skb; > } > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > return ret; > } > > -static void put_xdp_frags(struct xdp_buff *xdp) > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > { > struct skb_shared_info *shinfo; > struct page *xdp_page; > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > shinfo = xdp_get_shared_info_from_buff(xdp); > for (i = 0; i < shinfo->nr_frags; i++) { > xdp_page = skb_frag_page(&shinfo->frags[i]); > - put_page(xdp_page); > + virtnet_put_page(rq, xdp_page); > } > } > } > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > if (page_off + *len + tailroom > PAGE_SIZE) > return NULL; > > - page = alloc_page(GFP_ATOMIC); > + if (rq->page_pool) > + page = page_pool_dev_alloc_pages(rq->page_pool); > + else > + page = alloc_page(GFP_ATOMIC); > + > if (!page) > return NULL; > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > * is sending packet larger than the MTU. 
> */ > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > - put_page(p); > + virtnet_put_page(rq, p); > goto err_buf; > } > > memcpy(page_address(page) + page_off, > page_address(p) + off, buflen); > page_off += buflen; > - put_page(p); > + virtnet_put_page(rq, p); > } > > /* Headroom does not contribute to packet length */ > *len = page_off - VIRTIO_XDP_HEADROOM; > return page; > err_buf: > - __free_pages(page, 0); > + if (rq->page_pool) > + page_pool_put_full_page(rq->page_pool, page, true); > + else > + __free_pages(page, 0); > return NULL; > } > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > } > stats->bytes += len; > page = virt_to_head_page(buf); > - put_page(page); > + virtnet_put_page(rq, page); > } > } > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > cur_frag_size = truesize; > xdp_frags_truesz += cur_frag_size; > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > - put_page(page); > + virtnet_put_page(rq, page); > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > dev->name, len, (unsigned long)(truesize - room)); > dev->stats.rx_length_errors++; > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > return 0; > > err: > - put_xdp_frags(xdp); > + put_xdp_frags(xdp, rq); > return -EINVAL; > } > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > if (*len + xdp_room > PAGE_SIZE) > return NULL; > > - xdp_page = alloc_page(GFP_ATOMIC); > + if (rq->page_pool) > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > + else > + xdp_page = alloc_page(GFP_ATOMIC); > if (!xdp_page) > return NULL; > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > *frame_sz = PAGE_SIZE; > > - put_page(*page); > + virtnet_put_page(rq, *page); > > *page = xdp_page; > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > if (unlikely(!head_skb)) > break; > + if (rq->page_pool) > + skb_mark_for_recycle(head_skb); > return head_skb; > > case XDP_TX: > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > break; > } > > - put_xdp_frags(&xdp); > + put_xdp_frags(&xdp, rq); > > err_xdp: > - put_page(page); > + virtnet_put_page(rq, page); > mergeable_buf_free(rq, num_buf, dev, stats); > > stats->xdp_drops++; > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > curr_skb = head_skb; > > + if (rq->page_pool) > + skb_mark_for_recycle(curr_skb); > + > if (unlikely(!curr_skb)) > goto err_skb; > while (--num_buf) { > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > curr_skb = nskb; > head_skb->truesize += nskb->truesize; > num_skb_frags = 0; > + if (rq->page_pool) > + skb_mark_for_recycle(curr_skb); > } > if (curr_skb != head_skb) { > head_skb->data_len += len; > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > } > offset = buf - page_address(page); > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > - put_page(page); > + virtnet_put_page(rq, page); > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > len, truesize); > } else { > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > return head_skb; > > err_skb: > 
- put_page(page); > + virtnet_put_page(rq, page); > mergeable_buf_free(rq, num_buf, dev, stats); > > err_buf: > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > * disabled GSO for XDP, it won't be a big issue. > */ > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > - return -ENOMEM; > + if (rq->page_pool) { > + struct page *page; > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > - buf += headroom; /* advance address leaving hole at front of pkt */ > - get_page(alloc_frag->page); > - alloc_frag->offset += len + room; > - hole = alloc_frag->size - alloc_frag->offset; > - if (hole < len + room) { > - /* To avoid internal fragmentation, if there is very likely not > - * enough space for another buffer, add the remaining space to > - * the current buffer. > - * XDP core assumes that frame_size of xdp_buff and the length > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > - */ > - if (!headroom) > - len += hole; > - alloc_frag->offset += hole; > - } > + page = page_pool_dev_alloc_pages(rq->page_pool); > + if (unlikely(!page)) > + return -ENOMEM; > + buf = (char *)page_address(page); > + buf += headroom; /* advance address leaving hole at front of pkt */ > + } else { > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > + return -ENOMEM; > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > + buf += headroom; /* advance address leaving hole at front of pkt */ > + get_page(alloc_frag->page); > + alloc_frag->offset += len + room; > + hole = alloc_frag->size - alloc_frag->offset; > + if (hole < len + room) { > + /* To avoid internal fragmentation, if there is very likely not > + * enough space for another buffer, add the remaining space to > + * the current buffer. > + * XDP core assumes that frame_size of xdp_buff and the length > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > + */ > + if (!headroom) > + len += hole; > + alloc_frag->offset += hole; > + } > + } > sg_init_one(rq->sg, buf, len); > ctx = mergeable_len_to_ctx(len + room, headroom); > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > if (err < 0) > - put_page(virt_to_head_page(buf)); > + virtnet_put_page(rq, virt_to_head_page(buf)); > > return err; > } > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > if (err < 0) > return err; > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > - MEM_TYPE_PAGE_SHARED, NULL); > + if (vi->rq[qp_index].page_pool) > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > + MEM_TYPE_PAGE_POOL, > + vi->rq[qp_index].page_pool); > + else > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > + MEM_TYPE_PAGE_SHARED, > + NULL); > + > if (err < 0) > goto err_xdp_reg_mem_model; > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > virtnet_sq_stats_desc[j].desc); > } > + page_pool_ethtool_stats_get_strings(p); > break; > } > } > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > switch (sset) { > case ETH_SS_STATS: > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > - VIRTNET_SQ_STATS_LEN); > + VIRTNET_SQ_STATS_LEN + > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> + page_pool_ethtool_stats_get_count() : 0)); vi->curr_queue_pairs should not multiply page_pool_ethtool_stats_get_count(). Zhu Yanjun > default: > return -EOPNOTSUPP; > } > } > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > +{ > +#ifdef CONFIG_PAGE_POOL_STATS > + struct virtnet_info *vi = netdev_priv(dev); > + struct page_pool_stats pp_stats = {}; > + int i; > + > + for (i = 0; i < vi->curr_queue_pairs; i++) { > + if (!vi->rq[i].page_pool) > + continue; > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > + } > + page_pool_ethtool_stats_get(data, &pp_stats); > +#endif /* CONFIG_PAGE_POOL_STATS */ > +} > + > static void virtnet_get_ethtool_stats(struct net_device *dev, > struct ethtool_stats *stats, u64 *data) > { > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > idx += VIRTNET_SQ_STATS_LEN; > } > + > + virtnet_get_page_pool_stats(dev, &data[idx]); > } > > static void virtnet_get_channels(struct net_device *dev, > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > for (i = 0; i < vi->max_queue_pairs; i++) { > __netif_napi_del(&vi->rq[i].napi); > __netif_napi_del(&vi->sq[i].napi); > + if (vi->rq[i].page_pool) > + page_pool_destroy(vi->rq[i].page_pool); > } > > /* We called __netif_napi_del(), > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > struct virtnet_info *vi = vq->vdev->priv; > int i = vq2rxq(vq); > > - if (vi->mergeable_rx_bufs) > - put_page(virt_to_head_page(buf)); > - else if (vi->big_packets) > + if (vi->mergeable_rx_bufs) { > + if (vi->rq[i].page_pool) { > + page_pool_put_full_page(vi->rq[i].page_pool, > + virt_to_head_page(buf), > + true); > + } else { > + put_page(virt_to_head_page(buf)); > + } > + } else if (vi->big_packets) { > give_pages(&vi->rq[i], buf); > - else > + } else { > put_page(virt_to_head_page(buf)); > + } > } > > static void free_unused_bufs(struct virtnet_info *vi) > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > virtnet_free_queues(vi); > } > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > +{ > + struct virtio_device *vdev = rq->vq->vdev; > + > + struct page_pool_params pp_params = { > + .order = 0, > + .pool_size = rq->vq->num_max, > + .nid = dev_to_node(vdev->dev.parent), > + .dev = vdev->dev.parent, > + .offset = 0, > + }; > + > + rq->page_pool = page_pool_create(&pp_params); > + if (IS_ERR(rq->page_pool)) { > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > + PTR_ERR(rq->page_pool)); > + rq->page_pool = NULL; > + } > +} > + > /* How large should a single buffer be so a queue full of these can fit at > * least one full packet? > * Logic below assumes the mergeable buffer header is used. > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > vi->rq[i].vq = vqs[rxq2vq(i)]; > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > vi->sq[i].vq = vqs[txq2vq(i)]; > + > + if (page_pool_enabled && vi->mergeable_rx_bufs) > + virtnet_alloc_page_pool(&vi->rq[i]); > + else > + dev_warn(&vi->vdev->dev, > + "page pool only support mergeable mode\n"); > + > } > > /* run here: ret == 0. */
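To make the review comment above concrete: since virtnet_get_page_pool_stats() folds every queue into a single struct page_pool_stats and page_pool_ethtool_stats_get_strings() is emitted only once, the page pool counter count would be added once rather than multiplied by the number of queue pairs. A minimal sketch of that arithmetic, reusing the helpers and VIRTNET_*_STATS_LEN macros from the patch (an illustration of the comment, not the author's final fix):

static int virtnet_get_sset_count(struct net_device *dev, int sset)
{
	struct virtnet_info *vi = netdev_priv(dev);

	switch (sset) {
	case ETH_SS_STATS:
		/* Per-queue rx/tx counters scale with the number of queue
		 * pairs; the page pool counters are aggregated over all
		 * queues, so count them only once.
		 */
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN) +
		       (page_pool_enabled && vi->mergeable_rx_bufs ?
			page_pool_ethtool_stats_get_count() : 0);
	default:
		return -EOPNOTSUPP;
	}
}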
On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
> On 2023/5/26 13:46, Liang Chen wrote:
What made you respond to a patch from May, now?
On 2023/11/29 22:59, Michael S. Tsirkin wrote:
> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
>> On 2023/5/26 13:46, Liang Chen wrote:
>
> What made you respond to a patch from May, now?

I want to apply page_pool to our virtio_net. This virtio_net works on our device.

I want to verify whether page_pool on virtio_net with our device can improve performance or not.

And I found that the ethtool statistics handling is wrong.

I use virtio_net on our device, and I found that the page member variable in rq is not used in the recv path.

When virtio_net is modprobed, I checked the page member variable in rq with kprobe and the crash tool. The page member variable in rq is always NULL.

But the sg in the recv path is used.

So how is the page member variable in rq used? If it is always NULL, can we remove it?

BTW, I used the ping and iperf tools to test virtio_net. In those tests, the page member variable in rq was always NULL.

It is interesting.

Zhu Yanjun

>
On 2023/11/29 23:22, Zhu Yanjun wrote:
>
> On 2023/11/29 22:59, Michael S. Tsirkin wrote:
>> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
>>> On 2023/5/26 13:46, Liang Chen wrote:
>>
>> What made you respond to a patch from May, now?
>
> I want to apply page_pool to our virtio_net. This virtio_net works on
> our device.
>
> I want to verify whether page_pool on virtio_net with our device can
> improve performance or not.
>
> And I found that the ethtool statistics handling is wrong.
>
> I use virtio_net on our device, and I found that the page member
> variable in rq is not used in the recv path.
>
> When virtio_net is modprobed, I checked the page member variable in rq
> with kprobe and the crash tool. The page member variable in rq is
> always NULL.
>
> But the sg in the recv path is used.
>
> So how is the page member variable in rq used? If it is always NULL,
> can we remove it?
>
> BTW, I used the ping and iperf tools to test virtio_net. In those
> tests, the page member variable in rq was always NULL.

And I replaced the page member variable in rq with page_pool, but the page_pool statistics are always 0.

It is interesting that the page_pool member variable in rq is not used in the ping and iperf tests.

I am not sure which tests would make the page member variable non-NULL. ^_^

Best Regards,

Zhu Yanjun

>
> It is interesting.
>
> Zhu Yanjun
>
>>
On Wed, 29 Nov 2023 23:29:10 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote: > > 在 2023/11/29 23:22, Zhu Yanjun 写道: > > > > 在 2023/11/29 22:59, Michael S. Tsirkin 写道: > >> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote: > >>> 在 2023/5/26 13:46, Liang Chen 写道: > >> > >> what made you respond to a patch from May, now? > > > > I want to apply page_pool to our virtio_net. This virtio_net works on > > our device. > > > > I want to verify whether page_pool on virtio_net with our device can > > improve the performance or not. > > > > And I found that ethtool is wrong. > > > > I use virtio_net on our device. I found that page member variable in > > rq is not used in recv path. > > > > When virtio_net is modprobe, I checked page member variable in rq with > > kprobe or crash tool. page member variable in rq is always NULL. > > > > But sg in recv path is used. > > > > So how to use page member variable in rq? If page member variable in > > rq is always NULL, can we remove it? > > > > BTW, I use ping and iperf tool to make tests with virtio_net. In the > > tests, page member variable in rq is always NULL. > > > And I replaced page member variable in rq with page_pool, but the > statistics of page_pool are always 0. > > It is interesting that page_pool member variable in rq is not used in > ping and iperf tests. > > I am not sure what tests can make page member variable not NULL. ^_^ Do you mean rq->pages? That is for big mode. Thanks. > > Best Regards, > > Zhu Yanjun > > > > > > It is interesting. > > > > Zhu Yanjun > > > >>
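For readers unfamiliar with the term: "big mode" is the big-packets receive path, and rq->pages is the page chain used only by that path. A rough sketch of the refill dispatch, paraphrased from the mainline driver rather than taken from this patch, shows why the field stays NULL whenever mergeable receive buffers are negotiated:

/* Simplified paraphrase of try_fill_recv() from the mainline driver:
 * only the big-packets branch ever chains pages onto rq->pages (via
 * give_pages()/get_a_page()); with VIRTIO_NET_F_MRG_RXBUF negotiated
 * the mergeable branch is taken instead, so rq->pages is never used.
 */
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
{
	int err;

	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(vi, rq, gfp); /* frag or page_pool */
		else if (vi->big_packets)
			err = add_recvbuf_big(vi, rq, gfp);       /* uses rq->pages */
		else
			err = add_recvbuf_small(vi, rq, gfp);     /* one small buffer */
		if (err)
			break;
	} while (rq->vq->num_free);

	return !rq->vq->num_free;
}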
在 2023/11/30 10:34, Xuan Zhuo 写道: > On Wed, 29 Nov 2023 23:29:10 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote: >> 在 2023/11/29 23:22, Zhu Yanjun 写道: >>> 在 2023/11/29 22:59, Michael S. Tsirkin 写道: >>>> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote: >>>>> 在 2023/5/26 13:46, Liang Chen 写道: >>>> what made you respond to a patch from May, now? >>> I want to apply page_pool to our virtio_net. This virtio_net works on >>> our device. >>> >>> I want to verify whether page_pool on virtio_net with our device can >>> improve the performance or not. >>> >>> And I found that ethtool is wrong. >>> >>> I use virtio_net on our device. I found that page member variable in >>> rq is not used in recv path. >>> >>> When virtio_net is modprobe, I checked page member variable in rq with >>> kprobe or crash tool. page member variable in rq is always NULL. >>> >>> But sg in recv path is used. >>> >>> So how to use page member variable in rq? If page member variable in >>> rq is always NULL, can we remove it? >>> >>> BTW, I use ping and iperf tool to make tests with virtio_net. In the >>> tests, page member variable in rq is always NULL. >> >> And I replaced page member variable in rq with page_pool, but the >> statistics of page_pool are always 0. >> >> It is interesting that page_pool member variable in rq is not used in >> ping and iperf tests. >> >> I am not sure what tests can make page member variable not NULL. ^_^ > Do you mean rq->pages? > > That is for big mode. Hi, Xuan Got it. What is big mode? Do you mean big packet size? I run iperf with the packet size 2^23. The rq->pages is still NULL. It is interesting. Zhu Yanjun > > Thanks. > > >> Best Regards, >> >> Zhu Yanjun >> >> >>> It is interesting. >>> >>> Zhu Yanjun >>>
在 2023/5/26 13:46, Liang Chen 写道: > The implementation at the moment uses one page per packet in both the > normal and XDP path. In addition, introducing a module parameter to enable > or disable the usage of page pool (disabled by default). > > In single-core vm testing environments, it gives a modest performance gain > in the normal path. > Upstream codebase: 47.5 Gbits/sec > Upstream codebase + page_pool support: 50.2 Gbits/sec > > In multi-core vm testing environments, The most significant performance > gain is observed in XDP cpumap: > Upstream codebase: 1.38 Gbits/sec > Upstream codebase + page_pool support: 9.74 Gbits/sec > > With this foundation, we can further integrate page pool fragmentation and > DMA map/unmap support. > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com> > --- > drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++--------- > 1 file changed, 146 insertions(+), 42 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index c5dca0d92e64..99c0ca0c1781 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > module_param(gso, bool, 0444); > module_param(napi_tx, bool, 0644); > > +static bool page_pool_enabled; > +module_param(page_pool_enabled, bool, 0400); > + > /* FIXME: MTU in config. */ > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > #define GOOD_COPY_LEN 128 > @@ -159,6 +162,9 @@ struct receive_queue { > /* Chain pages by the private ptr. */ > struct page *pages; > > + /* Page pool */ > + struct page_pool *page_pool; > + > /* Average packet length for mergeable receive buffers. */ > struct ewma_pkt_len mrg_avg_pkt_len; > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, > return skb; > } > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > +{ > + if (rq->page_pool) > + page_pool_put_full_page(rq->page_pool, page, true); > + else > + put_page(page); > +} > + > /* Called from bottom half context */ > static struct sk_buff *page_to_skb(struct virtnet_info *vi, > struct receive_queue *rq, > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, > hdr = skb_vnet_hdr(skb); > memcpy(hdr, hdr_p, hdr_len); > if (page_to_free) > - put_page(page_to_free); > + virtnet_put_page(rq, page_to_free); > > return skb; > } > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > return ret; > } > > -static void put_xdp_frags(struct xdp_buff *xdp) > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > { > struct skb_shared_info *shinfo; > struct page *xdp_page; > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > shinfo = xdp_get_shared_info_from_buff(xdp); > for (i = 0; i < shinfo->nr_frags; i++) { > xdp_page = skb_frag_page(&shinfo->frags[i]); > - put_page(xdp_page); > + virtnet_put_page(rq, xdp_page); > } > } > } > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > if (page_off + *len + tailroom > PAGE_SIZE) > return NULL; > > - page = alloc_page(GFP_ATOMIC); > + if (rq->page_pool) > + page = page_pool_dev_alloc_pages(rq->page_pool); > + else > + page = alloc_page(GFP_ATOMIC); > + > if (!page) > return NULL; > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, > * is sending packet larger than the MTU. 
> */ > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > - put_page(p); > + virtnet_put_page(rq, p); > goto err_buf; > } > > memcpy(page_address(page) + page_off, > page_address(p) + off, buflen); > page_off += buflen; > - put_page(p); > + virtnet_put_page(rq, p); > } > > /* Headroom does not contribute to packet length */ > *len = page_off - VIRTIO_XDP_HEADROOM; > return page; > err_buf: > - __free_pages(page, 0); > + if (rq->page_pool) > + page_pool_put_full_page(rq->page_pool, page, true); > + else > + __free_pages(page, 0); > return NULL; > } > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, > } > stats->bytes += len; > page = virt_to_head_page(buf); > - put_page(page); > + virtnet_put_page(rq, page); > } > } > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > cur_frag_size = truesize; > xdp_frags_truesz += cur_frag_size; > if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { > - put_page(page); > + virtnet_put_page(rq, page); > pr_debug("%s: rx error: len %u exceeds truesize %lu\n", > dev->name, len, (unsigned long)(truesize - room)); > dev->stats.rx_length_errors++; > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, > return 0; > > err: > - put_xdp_frags(xdp); > + put_xdp_frags(xdp, rq); > return -EINVAL; > } > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > if (*len + xdp_room > PAGE_SIZE) > return NULL; > > - xdp_page = alloc_page(GFP_ATOMIC); > + if (rq->page_pool) > + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); > + else > + xdp_page = alloc_page(GFP_ATOMIC); > if (!xdp_page) > return NULL; > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, > > *frame_sz = PAGE_SIZE; > > - put_page(*page); > + virtnet_put_page(rq, *page); > > *page = xdp_page; > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); > if (unlikely(!head_skb)) > break; > + if (rq->page_pool) > + skb_mark_for_recycle(head_skb); > return head_skb; > > case XDP_TX: > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, > break; > } > > - put_xdp_frags(&xdp); > + put_xdp_frags(&xdp, rq); > > err_xdp: > - put_page(page); > + virtnet_put_page(rq, page); > mergeable_buf_free(rq, num_buf, dev, stats); > > stats->xdp_drops++; > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); > curr_skb = head_skb; > > + if (rq->page_pool) > + skb_mark_for_recycle(curr_skb); > + > if (unlikely(!curr_skb)) > goto err_skb; > while (--num_buf) { > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > curr_skb = nskb; > head_skb->truesize += nskb->truesize; > num_skb_frags = 0; > + if (rq->page_pool) > + skb_mark_for_recycle(curr_skb); > } > if (curr_skb != head_skb) { > head_skb->data_len += len; > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > } > offset = buf - page_address(page); > if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { > - put_page(page); > + virtnet_put_page(rq, page); > skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, > len, truesize); > } else { > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, > return head_skb; > > err_skb: > 
- put_page(page); > + virtnet_put_page(rq, page); > mergeable_buf_free(rq, num_buf, dev, stats); > > err_buf: > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, > * disabled GSO for XDP, it won't be a big issue. > */ > len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); > - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > - return -ENOMEM; > + if (rq->page_pool) { > + struct page *page; > > - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > - buf += headroom; /* advance address leaving hole at front of pkt */ > - get_page(alloc_frag->page); > - alloc_frag->offset += len + room; > - hole = alloc_frag->size - alloc_frag->offset; > - if (hole < len + room) { > - /* To avoid internal fragmentation, if there is very likely not > - * enough space for another buffer, add the remaining space to > - * the current buffer. > - * XDP core assumes that frame_size of xdp_buff and the length > - * of the frag are PAGE_SIZE, so we disable the hole mechanism. > - */ > - if (!headroom) > - len += hole; > - alloc_frag->offset += hole; > - } > + page = page_pool_dev_alloc_pages(rq->page_pool); > + if (unlikely(!page)) > + return -ENOMEM; > + buf = (char *)page_address(page); > + buf += headroom; /* advance address leaving hole at front of pkt */ > + } else { > + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) > + return -ENOMEM; > > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; > + buf += headroom; /* advance address leaving hole at front of pkt */ > + get_page(alloc_frag->page); > + alloc_frag->offset += len + room; > + hole = alloc_frag->size - alloc_frag->offset; > + if (hole < len + room) { > + /* To avoid internal fragmentation, if there is very likely not > + * enough space for another buffer, add the remaining space to > + * the current buffer. > + * XDP core assumes that frame_size of xdp_buff and the length > + * of the frag are PAGE_SIZE, so we disable the hole mechanism. > + */ > + if (!headroom) > + len += hole; > + alloc_frag->offset += hole; > + } > + } > sg_init_one(rq->sg, buf, len); > ctx = mergeable_len_to_ctx(len + room, headroom); > err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); > if (err < 0) > - put_page(virt_to_head_page(buf)); > + virtnet_put_page(rq, virt_to_head_page(buf)); > > return err; > } > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > if (err < 0) > return err; > > - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > - MEM_TYPE_PAGE_SHARED, NULL); > + if (vi->rq[qp_index].page_pool) > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > + MEM_TYPE_PAGE_POOL, > + vi->rq[qp_index].page_pool); > + else > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > + MEM_TYPE_PAGE_SHARED, > + NULL); > + > if (err < 0) > goto err_xdp_reg_mem_model; > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) > ethtool_sprintf(&p, "tx_queue_%u_%s", i, > virtnet_sq_stats_desc[j].desc); > } > + page_pool_ethtool_stats_get_strings(p); > break; > } > } > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) > switch (sset) { > case ETH_SS_STATS: > return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + > - VIRTNET_SQ_STATS_LEN); > + VIRTNET_SQ_STATS_LEN + > + (page_pool_enabled && vi->mergeable_rx_bufs ? 
> + page_pool_ethtool_stats_get_count() : 0)); > default: > return -EOPNOTSUPP; > } > } > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) > +{ > +#ifdef CONFIG_PAGE_POOL_STATS > + struct virtnet_info *vi = netdev_priv(dev); > + struct page_pool_stats pp_stats = {}; > + int i; > + > + for (i = 0; i < vi->curr_queue_pairs; i++) { > + if (!vi->rq[i].page_pool) > + continue; > + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); > + } > + page_pool_ethtool_stats_get(data, &pp_stats); > +#endif /* CONFIG_PAGE_POOL_STATS */ > +} > + > static void virtnet_get_ethtool_stats(struct net_device *dev, > struct ethtool_stats *stats, u64 *data) > { > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, > } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); > idx += VIRTNET_SQ_STATS_LEN; > } > + > + virtnet_get_page_pool_stats(dev, &data[idx]); virtnet_get_page_pool_stats(dev, &data[idx]); should be moved to between SQ and RQ. Combined with the correct page_pool_ethtool_stats_get_count, ethtool can get page_pool statistics. But all the values about page_pool are zero. It is interesting. I will delve into this and find out why. Zhu Yanjun > } > > static void virtnet_get_channels(struct net_device *dev, > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) > for (i = 0; i < vi->max_queue_pairs; i++) { > __netif_napi_del(&vi->rq[i].napi); > __netif_napi_del(&vi->sq[i].napi); > + if (vi->rq[i].page_pool) > + page_pool_destroy(vi->rq[i].page_pool); > } > > /* We called __netif_napi_del(), > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) > struct virtnet_info *vi = vq->vdev->priv; > int i = vq2rxq(vq); > > - if (vi->mergeable_rx_bufs) > - put_page(virt_to_head_page(buf)); > - else if (vi->big_packets) > + if (vi->mergeable_rx_bufs) { > + if (vi->rq[i].page_pool) { > + page_pool_put_full_page(vi->rq[i].page_pool, > + virt_to_head_page(buf), > + true); > + } else { > + put_page(virt_to_head_page(buf)); > + } > + } else if (vi->big_packets) { > give_pages(&vi->rq[i], buf); > - else > + } else { > put_page(virt_to_head_page(buf)); > + } > } > > static void free_unused_bufs(struct virtnet_info *vi) > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > virtnet_free_queues(vi); > } > > +static void virtnet_alloc_page_pool(struct receive_queue *rq) > +{ > + struct virtio_device *vdev = rq->vq->vdev; > + > + struct page_pool_params pp_params = { > + .order = 0, > + .pool_size = rq->vq->num_max, > + .nid = dev_to_node(vdev->dev.parent), > + .dev = vdev->dev.parent, > + .offset = 0, > + }; > + > + rq->page_pool = page_pool_create(&pp_params); > + if (IS_ERR(rq->page_pool)) { > + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > + PTR_ERR(rq->page_pool)); > + rq->page_pool = NULL; > + } > +} > + > /* How large should a single buffer be so a queue full of these can fit at > * least one full packet? > * Logic below assumes the mergeable buffer header is used. > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > vi->rq[i].vq = vqs[rxq2vq(i)]; > vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); > vi->sq[i].vq = vqs[txq2vq(i)]; > + > + if (page_pool_enabled && vi->mergeable_rx_bufs) > + virtnet_alloc_page_pool(&vi->rq[i]); > + else > + dev_warn(&vi->vdev->dev, > + "page pool only support mergeable mode\n"); > + > } > > /* run here: ret == 0. */
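Whatever placement is finally chosen for the page pool counters, the three ethtool callbacks have to agree on it: the order in which virtnet_get_ethtool_stats() fills data[] must mirror the order in which virtnet_get_strings() emits names, and virtnet_get_sset_count() must return exactly that many entries. A sketch of that invariant with one possible, purely hypothetical layout, using the per-queue lengths and helpers from the patch:

/* Hypothetical layout; the point is only that strings, count and data
 * must all follow the same order:
 *
 *   strings: rx_queue_0_* .. rx_queue_N_*, tx_queue_0_* .. tx_queue_N_*, page_pool_*
 *   data:    [rq stats per queue]          [sq stats per queue]          [aggregated pp stats]
 */
static void virtnet_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int idx = 0, i;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		/* copy VIRTNET_RQ_STATS_LEN u64 counters for vi->rq[i] ... */
		idx += VIRTNET_RQ_STATS_LEN;
	}
	for (i = 0; i < vi->curr_queue_pairs; i++) {
		/* copy VIRTNET_SQ_STATS_LEN u64 counters for vi->sq[i] ... */
		idx += VIRTNET_SQ_STATS_LEN;
	}
	/* Aggregated page pool counters last, matching the strings above. */
	virtnet_get_page_pool_stats(dev, &data[idx]);
}

With that invariant respected, counters that still read zero would point at the pool genuinely not being exercised rather than at a layout mismatch.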
On Thu, 30 Nov 2023 13:30:40 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote: > > 在 2023/11/30 10:34, Xuan Zhuo 写道: > > On Wed, 29 Nov 2023 23:29:10 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote: > >> 在 2023/11/29 23:22, Zhu Yanjun 写道: > >>> 在 2023/11/29 22:59, Michael S. Tsirkin 写道: > >>>> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote: > >>>>> 在 2023/5/26 13:46, Liang Chen 写道: > >>>> what made you respond to a patch from May, now? > >>> I want to apply page_pool to our virtio_net. This virtio_net works on > >>> our device. > >>> > >>> I want to verify whether page_pool on virtio_net with our device can > >>> improve the performance or not. > >>> > >>> And I found that ethtool is wrong. > >>> > >>> I use virtio_net on our device. I found that page member variable in > >>> rq is not used in recv path. > >>> > >>> When virtio_net is modprobe, I checked page member variable in rq with > >>> kprobe or crash tool. page member variable in rq is always NULL. > >>> > >>> But sg in recv path is used. > >>> > >>> So how to use page member variable in rq? If page member variable in > >>> rq is always NULL, can we remove it? > >>> > >>> BTW, I use ping and iperf tool to make tests with virtio_net. In the > >>> tests, page member variable in rq is always NULL. > >> > >> And I replaced page member variable in rq with page_pool, but the > >> statistics of page_pool are always 0. > >> > >> It is interesting that page_pool member variable in rq is not used in > >> ping and iperf tests. > >> > >> I am not sure what tests can make page member variable not NULL. ^_^ > > Do you mean rq->pages? > > > > That is for big mode. > > Hi, Xuan > > Got it. What is big mode? Do you mean big packet size? I run iperf with > the packet size 2^23. > > The rq->pages is still NULL. > > It is interesting. You may need to check the code of virtnet_probe(). Thanks. > > Zhu Yanjun > > > > > > Thanks. > > > > > >> Best Regards, > >> > >> Zhu Yanjun > >> > >> > >>> It is interesting. > >>> > >>> Zhu Yanjun > >>>
在 2023/12/1 9:38, Xuan Zhuo 写道: > On Thu, 30 Nov 2023 13:30:40 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote: >> >> 在 2023/11/30 10:34, Xuan Zhuo 写道: >>> On Wed, 29 Nov 2023 23:29:10 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote: >>>> 在 2023/11/29 23:22, Zhu Yanjun 写道: >>>>> 在 2023/11/29 22:59, Michael S. Tsirkin 写道: >>>>>> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote: >>>>>>> 在 2023/5/26 13:46, Liang Chen 写道: >>>>>> what made you respond to a patch from May, now? >>>>> I want to apply page_pool to our virtio_net. This virtio_net works on >>>>> our device. >>>>> >>>>> I want to verify whether page_pool on virtio_net with our device can >>>>> improve the performance or not. >>>>> >>>>> And I found that ethtool is wrong. >>>>> >>>>> I use virtio_net on our device. I found that page member variable in >>>>> rq is not used in recv path. >>>>> >>>>> When virtio_net is modprobe, I checked page member variable in rq with >>>>> kprobe or crash tool. page member variable in rq is always NULL. >>>>> >>>>> But sg in recv path is used. >>>>> >>>>> So how to use page member variable in rq? If page member variable in >>>>> rq is always NULL, can we remove it? >>>>> >>>>> BTW, I use ping and iperf tool to make tests with virtio_net. In the >>>>> tests, page member variable in rq is always NULL. >>>> >>>> And I replaced page member variable in rq with page_pool, but the >>>> statistics of page_pool are always 0. >>>> >>>> It is interesting that page_pool member variable in rq is not used in >>>> ping and iperf tests. >>>> >>>> I am not sure what tests can make page member variable not NULL. ^_^ >>> Do you mean rq->pages? >>> >>> That is for big mode. >> >> Hi, Xuan >> >> Got it. What is big mode? Do you mean big packet size? I run iperf with >> the packet size 2^23. >> >> The rq->pages is still NULL. >> >> It is interesting. > > You may need to check the code of virtnet_probe(). Thanks a lot. From virtnet_probe, big mode and mergeable mode can be found. Zhu Yanjun > > Thanks. > > >> >> Zhu Yanjun >> >> >>> >>> Thanks. >>> >>> >>>> Best Regards, >>>> >>>> Zhu Yanjun >>>> >>>> >>>>> It is interesting. >>>>> >>>>> Zhu Yanjun >>>>>
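"Big mode" versus mergeable mode is decided once, at probe time, from the negotiated feature bits, not from the packet size a benchmark sends. Roughly, paraphrasing virtnet_probe() from the mainline driver (not part of this patch):

	/* If the device may hand us any GSO packet, large receive buffers
	 * are needed, i.e. "big" mode with page chains on rq->pages.
	 */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	/* Mergeable receive buffers take precedence in the receive path,
	 * so when this bit is negotiated (the common case with QEMU),
	 * rq->pages stays NULL; it is also the only case in which this
	 * patch creates a page pool.
	 */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

So an iperf run with a large message size will not switch the mode; only the device's negotiated features can.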
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index c5dca0d92e64..99c0ca0c1781 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); module_param(gso, bool, 0444); module_param(napi_tx, bool, 0644); +static bool page_pool_enabled; +module_param(page_pool_enabled, bool, 0400); + /* FIXME: MTU in config. */ #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 @@ -159,6 +162,9 @@ struct receive_queue { /* Chain pages by the private ptr. */ struct page *pages; + /* Page pool */ + struct page_pool *page_pool; + /* Average packet length for mergeable receive buffers. */ struct ewma_pkt_len mrg_avg_pkt_len; @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, return skb; } +static void virtnet_put_page(struct receive_queue *rq, struct page *page) +{ + if (rq->page_pool) + page_pool_put_full_page(rq->page_pool, page, true); + else + put_page(page); +} + /* Called from bottom half context */ static struct sk_buff *page_to_skb(struct virtnet_info *vi, struct receive_queue *rq, @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, hdr = skb_vnet_hdr(skb); memcpy(hdr, hdr_p, hdr_len); if (page_to_free) - put_page(page_to_free); + virtnet_put_page(rq, page_to_free); return skb; } @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, return ret; } -static void put_xdp_frags(struct xdp_buff *xdp) +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) { struct skb_shared_info *shinfo; struct page *xdp_page; @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) shinfo = xdp_get_shared_info_from_buff(xdp); for (i = 0; i < shinfo->nr_frags; i++) { xdp_page = skb_frag_page(&shinfo->frags[i]); - put_page(xdp_page); + virtnet_put_page(rq, xdp_page); } } } @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, if (page_off + *len + tailroom > PAGE_SIZE) return NULL; - page = alloc_page(GFP_ATOMIC); + if (rq->page_pool) + page = page_pool_dev_alloc_pages(rq->page_pool); + else + page = alloc_page(GFP_ATOMIC); + if (!page) return NULL; @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, * is sending packet larger than the MTU. 
*/ if ((page_off + buflen + tailroom) > PAGE_SIZE) { - put_page(p); + virtnet_put_page(rq, p); goto err_buf; } memcpy(page_address(page) + page_off, page_address(p) + off, buflen); page_off += buflen; - put_page(p); + virtnet_put_page(rq, p); } /* Headroom does not contribute to packet length */ *len = page_off - VIRTIO_XDP_HEADROOM; return page; err_buf: - __free_pages(page, 0); + if (rq->page_pool) + page_pool_put_full_page(rq->page_pool, page, true); + else + __free_pages(page, 0); return NULL; } @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf, } stats->bytes += len; page = virt_to_head_page(buf); - put_page(page); + virtnet_put_page(rq, page); } } @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, cur_frag_size = truesize; xdp_frags_truesz += cur_frag_size; if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) { - put_page(page); + virtnet_put_page(rq, page); pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)(truesize - room)); dev->stats.rx_length_errors++; @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, return 0; err: - put_xdp_frags(xdp); + put_xdp_frags(xdp, rq); return -EINVAL; } @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, if (*len + xdp_room > PAGE_SIZE) return NULL; - xdp_page = alloc_page(GFP_ATOMIC); + if (rq->page_pool) + xdp_page = page_pool_dev_alloc_pages(rq->page_pool); + else + xdp_page = alloc_page(GFP_ATOMIC); if (!xdp_page) return NULL; @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi, *frame_sz = PAGE_SIZE; - put_page(*page); + virtnet_put_page(rq, *page); *page = xdp_page; @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); if (unlikely(!head_skb)) break; + if (rq->page_pool) + skb_mark_for_recycle(head_skb); return head_skb; case XDP_TX: @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, break; } - put_xdp_frags(&xdp); + put_xdp_frags(&xdp, rq); err_xdp: - put_page(page); + virtnet_put_page(rq, page); mergeable_buf_free(rq, num_buf, dev, stats); stats->xdp_drops++; @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); curr_skb = head_skb; + if (rq->page_pool) + skb_mark_for_recycle(curr_skb); + if (unlikely(!curr_skb)) goto err_skb; while (--num_buf) { @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, curr_skb = nskb; head_skb->truesize += nskb->truesize; num_skb_frags = 0; + if (rq->page_pool) + skb_mark_for_recycle(curr_skb); } if (curr_skb != head_skb) { head_skb->data_len += len; @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } offset = buf - page_address(page); if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { - put_page(page); + virtnet_put_page(rq, page); skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, len, truesize); } else { @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, return head_skb; err_skb: - put_page(page); + virtnet_put_page(rq, page); mergeable_buf_free(rq, num_buf, dev, stats); err_buf: @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, * disabled GSO for XDP, it won't be a big issue. 
*/ len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); - if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) - return -ENOMEM; + if (rq->page_pool) { + struct page *page; - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; - buf += headroom; /* advance address leaving hole at front of pkt */ - get_page(alloc_frag->page); - alloc_frag->offset += len + room; - hole = alloc_frag->size - alloc_frag->offset; - if (hole < len + room) { - /* To avoid internal fragmentation, if there is very likely not - * enough space for another buffer, add the remaining space to - * the current buffer. - * XDP core assumes that frame_size of xdp_buff and the length - * of the frag are PAGE_SIZE, so we disable the hole mechanism. - */ - if (!headroom) - len += hole; - alloc_frag->offset += hole; - } + page = page_pool_dev_alloc_pages(rq->page_pool); + if (unlikely(!page)) + return -ENOMEM; + buf = (char *)page_address(page); + buf += headroom; /* advance address leaving hole at front of pkt */ + } else { + if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) + return -ENOMEM; + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; + buf += headroom; /* advance address leaving hole at front of pkt */ + get_page(alloc_frag->page); + alloc_frag->offset += len + room; + hole = alloc_frag->size - alloc_frag->offset; + if (hole < len + room) { + /* To avoid internal fragmentation, if there is very likely not + * enough space for another buffer, add the remaining space to + * the current buffer. + * XDP core assumes that frame_size of xdp_buff and the length + * of the frag are PAGE_SIZE, so we disable the hole mechanism. + */ + if (!headroom) + len += hole; + alloc_frag->offset += hole; + } + } sg_init_one(rq->sg, buf, len); ctx = mergeable_len_to_ctx(len + room, headroom); err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) - put_page(virt_to_head_page(buf)); + virtnet_put_page(rq, virt_to_head_page(buf)); return err; } @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) if (err < 0) return err; - err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, - MEM_TYPE_PAGE_SHARED, NULL); + if (vi->rq[qp_index].page_pool) + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, + MEM_TYPE_PAGE_POOL, + vi->rq[qp_index].page_pool); + else + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, + MEM_TYPE_PAGE_SHARED, + NULL); + if (err < 0) goto err_xdp_reg_mem_model; @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) ethtool_sprintf(&p, "tx_queue_%u_%s", i, virtnet_sq_stats_desc[j].desc); } + page_pool_ethtool_stats_get_strings(p); break; } } @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset) switch (sset) { case ETH_SS_STATS: return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN + - VIRTNET_SQ_STATS_LEN); + VIRTNET_SQ_STATS_LEN + + (page_pool_enabled && vi->mergeable_rx_bufs ? 
+ page_pool_ethtool_stats_get_count() : 0)); default: return -EOPNOTSUPP; } } +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data) +{ +#ifdef CONFIG_PAGE_POOL_STATS + struct virtnet_info *vi = netdev_priv(dev); + struct page_pool_stats pp_stats = {}; + int i; + + for (i = 0; i < vi->curr_queue_pairs; i++) { + if (!vi->rq[i].page_pool) + continue; + page_pool_get_stats(vi->rq[i].page_pool, &pp_stats); + } + page_pool_ethtool_stats_get(data, &pp_stats); +#endif /* CONFIG_PAGE_POOL_STATS */ +} + static void virtnet_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev, } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); idx += VIRTNET_SQ_STATS_LEN; } + + virtnet_get_page_pool_stats(dev, &data[idx]); } static void virtnet_get_channels(struct net_device *dev, @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi) for (i = 0; i < vi->max_queue_pairs; i++) { __netif_napi_del(&vi->rq[i].napi); __netif_napi_del(&vi->sq[i].napi); + if (vi->rq[i].page_pool) + page_pool_destroy(vi->rq[i].page_pool); } /* We called __netif_napi_del(), @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) struct virtnet_info *vi = vq->vdev->priv; int i = vq2rxq(vq); - if (vi->mergeable_rx_bufs) - put_page(virt_to_head_page(buf)); - else if (vi->big_packets) + if (vi->mergeable_rx_bufs) { + if (vi->rq[i].page_pool) { + page_pool_put_full_page(vi->rq[i].page_pool, + virt_to_head_page(buf), + true); + } else { + put_page(virt_to_head_page(buf)); + } + } else if (vi->big_packets) { give_pages(&vi->rq[i], buf); - else + } else { put_page(virt_to_head_page(buf)); + } } static void free_unused_bufs(struct virtnet_info *vi) @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi) virtnet_free_queues(vi); } +static void virtnet_alloc_page_pool(struct receive_queue *rq) +{ + struct virtio_device *vdev = rq->vq->vdev; + + struct page_pool_params pp_params = { + .order = 0, + .pool_size = rq->vq->num_max, + .nid = dev_to_node(vdev->dev.parent), + .dev = vdev->dev.parent, + .offset = 0, + }; + + rq->page_pool = page_pool_create(&pp_params); + if (IS_ERR(rq->page_pool)) { + dev_warn(&vdev->dev, "page pool creation failed: %ld\n", + PTR_ERR(rq->page_pool)); + rq->page_pool = NULL; + } +} + /* How large should a single buffer be so a queue full of these can fit at * least one full packet? * Logic below assumes the mergeable buffer header is used. @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi) vi->rq[i].vq = vqs[rxq2vq(i)]; vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); vi->sq[i].vq = vqs[txq2vq(i)]; + + if (page_pool_enabled && vi->mergeable_rx_bufs) + virtnet_alloc_page_pool(&vi->rq[i]); + else + dev_warn(&vi->vdev->dev, + "page pool only support mergeable mode\n"); + } /* run here: ret == 0. */
The implementation at the moment uses one page per packet in both the
normal and XDP path. In addition, a module parameter is introduced to
enable or disable the usage of the page pool (disabled by default).

In single-core VM testing environments, it gives a modest performance gain
in the normal path.
Upstream codebase: 47.5 Gbits/sec
Upstream codebase + page_pool support: 50.2 Gbits/sec

In multi-core VM testing environments, the most significant performance
gain is observed in XDP cpumap:
Upstream codebase: 1.38 Gbits/sec
Upstream codebase + page_pool support: 9.74 Gbits/sec

With this foundation, we can further integrate page pool fragmentation and
DMA map/unmap support.

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
---
 drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
 1 file changed, 146 insertions(+), 42 deletions(-)