
[net-next,2/2] tun: AF_XDP Rx zero-copy support

Message ID 1706089075-16084-1-git-send-email-wangyunjian@huawei.com (mailing list archive)
State New, archived
Series tun: AF_XDP Rx zero-copy support

Commit Message

wangyunjian Jan. 24, 2024, 9:37 a.m. UTC
The zero-copy feature of AF_XDP sockets is now supported by some
drivers, which can reduce CPU utilization in the XDP program.
This patch set allows tun to support the AF_XDP Rx zero-copy feature.

This patch tries to address this by:
- Using peek_len to consume an xsk->desc and get its length.
- When tun supports AF_XDP Rx zero-copy, the vq's array may be empty,
so adding a check for an empty vq's array in vhost_net_buf_produce().
- Adding XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support.
- Adding a tun_put_user_desc() function to copy the Rx data to the VM.

Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
---
 drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/net.c |  18 +++--
 2 files changed, 176 insertions(+), 7 deletions(-)

Comments

Willem de Bruijn Jan. 24, 2024, 7:05 p.m. UTC | #1
Yunjian Wang wrote:
> Now the zero-copy feature of AF_XDP socket is supported by some
> drivers, which can reduce CPU utilization on the xdp program.
> This patch set allows tun to support AF_XDP Rx zero-copy feature.
> 
> This patch tries to address this by:
> - Use peek_len to consume a xsk->desc and get xsk->desc length.
> - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> So add a check for empty vq's array in vhost_net_buf_produce().
> - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> - add tun_put_user_desc function to copy the Rx data to VM
> 
> Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>

I don't fully understand the higher level design of this feature yet.

But some initial comments at the code level.

> ---
>  drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/net.c |  18 +++--
>  2 files changed, 176 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index afa5497f7c35..248b0f8e07d1 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -77,6 +77,7 @@
>  #include <net/ax25.h>
>  #include <net/rose.h>
>  #include <net/6lowpan.h>
> +#include <net/xdp_sock_drv.h>
>  
>  #include <linux/uaccess.h>
>  #include <linux/proc_fs.h>
> @@ -145,6 +146,10 @@ struct tun_file {
>  	struct tun_struct *detached;
>  	struct ptr_ring tx_ring;
>  	struct xdp_rxq_info xdp_rxq;
> +	struct xdp_desc desc;
> +	/* protects xsk pool */
> +	spinlock_t pool_lock;
> +	struct xsk_buff_pool *pool;
>  };
>  
>  struct tun_page {
> @@ -208,6 +213,8 @@ struct tun_struct {
>  	struct bpf_prog __rcu *xdp_prog;
>  	struct tun_prog __rcu *steering_prog;
>  	struct tun_prog __rcu *filter_prog;
> +	/* tracks AF_XDP ZC enabled queues */
> +	unsigned long *af_xdp_zc_qps;
>  	struct ethtool_link_ksettings link_ksettings;
>  	/* init args */
>  	struct file *file;
> @@ -795,6 +802,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>  
>  	tfile->queue_index = tun->numqueues;
>  	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
> +	tfile->desc.len = 0;
> +	tfile->pool = NULL;
>  
>  	if (tfile->detached) {
>  		/* Re-attach detached tfile, updating XDP queue_index */
> @@ -989,6 +998,13 @@ static int tun_net_init(struct net_device *dev)
>  		return err;
>  	}
>  
> +	tun->af_xdp_zc_qps = bitmap_zalloc(MAX_TAP_QUEUES, GFP_KERNEL);
> +	if (!tun->af_xdp_zc_qps) {
> +		security_tun_dev_free_security(tun->security);
> +		free_percpu(dev->tstats);
> +		return -ENOMEM;
> +	}
> +
>  	tun_flow_init(tun);
>  
>  	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> @@ -1009,6 +1025,7 @@ static int tun_net_init(struct net_device *dev)
>  		tun_flow_uninit(tun);
>  		security_tun_dev_free_security(tun->security);
>  		free_percpu(dev->tstats);
> +		bitmap_free(tun->af_xdp_zc_qps);

Please release state in inverse order of acquire.
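
For example, since the bitmap is allocated after the security context
and tstats but before tun_flow_init(), the unwind here would roughly
become (untested sketch):

        tun_flow_uninit(tun);
        bitmap_free(tun->af_xdp_zc_qps);
        security_tun_dev_free_security(tun->security);
        free_percpu(dev->tstats);
        return err;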

>  		return err;
>  	}
>  	return 0;
> @@ -1222,11 +1239,77 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>  	return 0;
>  }
>  
> +static int tun_xsk_pool_enable(struct net_device *netdev,
> +			       struct xsk_buff_pool *pool,
> +			       u16 qid)
> +{
> +	struct tun_struct *tun = netdev_priv(netdev);
> +	struct tun_file *tfile;
> +	unsigned long flags;
> +
> +	rcu_read_lock();
> +	tfile = rtnl_dereference(tun->tfiles[qid]);
> +	if (!tfile) {
> +		rcu_read_unlock();
> +		return -ENODEV;
> +	}

No need for rcu_read_lock with rtnl_dereference.

Consider ASSERT_RTNL() if unsure whether this path could be reached
without the RTNL held.
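
That is, drop the rcu_read_lock()/rcu_read_unlock() pair, roughly
(untested sketch):

        ASSERT_RTNL();

        tfile = rtnl_dereference(tun->tfiles[qid]);
        if (!tfile)
                return -ENODEV;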

> +
> +	spin_lock_irqsave(&tfile->pool_lock, flags);
> +	xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> +	tfile->pool = pool;
> +	spin_unlock_irqrestore(&tfile->pool_lock, flags);
> +
> +	rcu_read_unlock();
> +	set_bit(qid, tun->af_xdp_zc_qps);

What are the concurrency semantics: there's a spinlock to make
the update to xdp_rxq and pool a critical section, but the bitmap
is not part of this? Please also then document why the irqsave.
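
E.g. a comment along these lines, with the rationale filled in (and if
the pool is never touched from hard irq context, spin_lock_bh() may be
the lighter choice):

        /* pool_lock protects tfile->pool and tfile->desc against
         * concurrent enable/disable; irqsave because <why>.
         */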

> +
> +	return 0;
> +}
> +
> +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
> +{
> +	struct tun_struct *tun = netdev_priv(netdev);
> +	struct tun_file *tfile;
> +	unsigned long flags;
> +
> +	if (!test_bit(qid, tun->af_xdp_zc_qps))
> +		return 0;
> +
> +	clear_bit(qid, tun->af_xdp_zc_qps);

Time of check to time of use race between test and clear? Or is
there no race because anything that clears will hold the RTNL? If so,
please add a comment.
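
E.g. something like this, assuming XDP_SETUP_XSK_POOL is always
invoked under the RTNL:

        /* Serialized by the RTNL, so no race between this test_bit()
         * and the clear_bit() below.
         */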

> +
> +	rcu_read_lock();
> +	tfile = rtnl_dereference(tun->tfiles[qid]);
> +	if (!tfile) {
> +		rcu_read_unlock();
> +		return 0;
> +	}
> +
> +	spin_lock_irqsave(&tfile->pool_lock, flags);
> +	if (tfile->desc.len) {
> +		xsk_tx_completed(tfile->pool, 1);
> +		tfile->desc.len = 0;
> +	}
> +	tfile->pool = NULL;
> +	spin_unlock_irqrestore(&tfile->pool_lock, flags);
> +
> +	rcu_read_unlock();
> +	return 0;
> +}
> +
> +int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
> +		       u16 qid)
> +{
> +	return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> +		tun_xsk_pool_disable(dev, qid);
> +}
> +
>  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  {
>  	switch (xdp->command) {
>  	case XDP_SETUP_PROG:
>  		return tun_xdp_set(dev, xdp->prog, xdp->extack);
> +	case XDP_SETUP_XSK_POOL:
> +		return tun_xsk_pool_setup(dev, xdp->xsk.pool,
> +					   xdp->xsk.queue_id);
>  	default:
>  		return -EINVAL;
>  	}
> @@ -1331,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
>  	return nxmit;
>  }
>  
> +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> +{
> +	struct tun_struct *tun = netdev_priv(dev);
> +	struct tun_file *tfile;
> +
> +	rcu_read_lock();
> +	tfile = rcu_dereference(tun->tfiles[qid]);
> +	if (tfile)
> +		__tun_xdp_flush_tfile(tfile);
> +	rcu_read_unlock();
> +	return 0;
> +}
> +
>  static const struct net_device_ops tap_netdev_ops = {
>  	.ndo_init		= tun_net_init,
>  	.ndo_uninit		= tun_net_uninit,
> @@ -1347,6 +1443,7 @@ static const struct net_device_ops tap_netdev_ops = {
>  	.ndo_get_stats64	= dev_get_tstats64,
>  	.ndo_bpf		= tun_xdp,
>  	.ndo_xdp_xmit		= tun_xdp_xmit,
> +	.ndo_xsk_wakeup		= tun_xsk_wakeup,
>  	.ndo_change_carrier	= tun_net_change_carrier,
>  };
>  
> @@ -1404,7 +1501,8 @@ static void tun_net_initialize(struct net_device *dev)
>  		/* Currently tun does not support XDP, only tap does. */
>  		dev->xdp_features = NETDEV_XDP_ACT_BASIC |
>  				    NETDEV_XDP_ACT_REDIRECT |
> -				    NETDEV_XDP_ACT_NDO_XMIT;
> +				    NETDEV_XDP_ACT_NDO_XMIT |
> +				    NETDEV_XDP_ACT_XSK_ZEROCOPY;
>  
>  		break;
>  	}
> @@ -2213,6 +2311,37 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>  	return ptr;
>  }
>  
> +static ssize_t tun_put_user_desc(struct tun_struct *tun,
> +				 struct tun_file *tfile,
> +				 struct xdp_desc *desc,
> +				 struct iov_iter *iter)
> +{
> +	size_t size = desc->len;
> +	int vnet_hdr_sz = 0;
> +	size_t ret;
> +
> +	if (tun->flags & IFF_VNET_HDR) {
> +		struct virtio_net_hdr_mrg_rxbuf gso = { 0 };
> +
> +		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
> +		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
> +			return -EINVAL;
> +		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
> +			     sizeof(gso)))
> +			return -EFAULT;
> +		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> +	}
> +
> +	ret = copy_to_iter(xsk_buff_raw_get_data(tfile->pool, desc->addr),
> +			   size, iter) + vnet_hdr_sz;
> +
> +	preempt_disable();
> +	dev_sw_netstats_tx_add(tun->dev, 1, ret);
> +	preempt_enable();
> +
> +	return ret;
> +}
> +

This is almost a copy of tun_put_user_xdp(). Can we refactor to avoid
duplicating code? (This may apply here and elsewhere.)
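
For example, the body shared with tun_put_user_xdp() could move into a
helper (sketch based on tun_put_user_desc() above; name hypothetical):

        static ssize_t tun_put_user_data(struct tun_struct *tun,
                                         void *data, size_t size,
                                         struct iov_iter *iter)
        {
                int vnet_hdr_sz = 0;
                size_t ret;

                if (tun->flags & IFF_VNET_HDR) {
                        struct virtio_net_hdr_mrg_rxbuf gso = { 0 };

                        vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
                        if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
                                return -EINVAL;
                        if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
                                     sizeof(gso)))
                                return -EFAULT;
                        iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
                }

                ret = copy_to_iter(data, size, iter) + vnet_hdr_sz;

                preempt_disable();
                dev_sw_netstats_tx_add(tun->dev, 1, ret);
                preempt_enable();

                return ret;
        }

so that tun_put_user_desc() reduces to:

        return tun_put_user_data(tun,
                                 xsk_buff_raw_get_data(tfile->pool,
                                                       desc->addr),
                                 desc->len, iter);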

>  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>  			   struct iov_iter *to,
>  			   int noblock, void *ptr)
> @@ -2226,6 +2355,22 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>  	}
>  
>  	if (!ptr) {
> +		/* Read frames from xsk's desc */
> +		if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> +			spin_lock(&tfile->pool_lock);
> +			if (tfile->pool) {
> +				ret = tun_put_user_desc(tun, tfile, &tfile->desc, to);
> +				xsk_tx_completed(tfile->pool, 1);
> +				if (xsk_uses_need_wakeup(tfile->pool))
> +					xsk_set_tx_need_wakeup(tfile->pool);

For the xsk maintainers if they're reading along: this two-line
pattern is seen quite often. Deserves a helper in a header file?
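
Something like (hypothetical name):

        static inline void xsk_tx_wakeup_if_needed(struct xsk_buff_pool *pool)
        {
                if (xsk_uses_need_wakeup(pool))
                        xsk_set_tx_need_wakeup(pool);
        }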

> +				tfile->desc.len = 0;
> +			} else {
> +				ret = -EBADFD;
> +			}
> +			spin_unlock(&tfile->pool_lock);
> +			return ret;
> +		}
> +
>  		/* Read frames from ring */
>  		ptr = tun_ring_recv(tfile, noblock, &err);
>  		if (!ptr)
> @@ -2311,6 +2456,7 @@ static void tun_free_netdev(struct net_device *dev)
>  
>  	BUG_ON(!(list_empty(&tun->disabled)));
>  
> +	bitmap_free(tun->af_xdp_zc_qps);
>  	free_percpu(dev->tstats);
>  	tun_flow_uninit(tun);
>  	security_tun_dev_free_security(tun->security);
> @@ -2666,7 +2812,19 @@ static int tun_peek_len(struct socket *sock)
>  	if (!tun)
>  		return 0;
>  
> -	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> +	if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> +		spin_lock(&tfile->pool_lock);
> +		if (tfile->pool && xsk_tx_peek_desc(tfile->pool, &tfile->desc)) {
> +			xsk_tx_release(tfile->pool);
> +			ret = tfile->desc.len;
> +			/* The length of desc must be greater than 0 */
> +			if (!ret)
> +				xsk_tx_completed(tfile->pool, 1);

Peek semantics usually don't result in releasing the buffer. Am I
misunderstanding this operation?
> +		}
> +		spin_unlock(&tfile->pool_lock);
> +	} else {
> +		ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> +	}
>  	tun_put(tun);
>  
>  	return ret;
> @@ -3469,8 +3627,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>  
>  	mutex_init(&tfile->napi_mutex);
>  	RCU_INIT_POINTER(tfile->tun, NULL);
> +	spin_lock_init(&tfile->pool_lock);
>  	tfile->flags = 0;
>  	tfile->ifindex = 0;
> +	tfile->pool = NULL;
> +	tfile->desc.len = 0;
>  
>  	init_waitqueue_head(&tfile->socket.wq.wait);
>  
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index f2ed7167c848..a1f143ad2341 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c

For virtio maintainer: is it okay to have tun and vhost/net changes in
the same patch, or is it better to split them?

> @@ -169,9 +169,10 @@ static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
>  
>  static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
>  {
> -	void *ret = vhost_net_buf_get_ptr(rxq);
> -	++rxq->head;
> -	return ret;
> +	if (rxq->tail == rxq->head)
> +		return NULL;
> +
> +	return rxq->queue[rxq->head++];

Why this change?

>  }
>  
>  static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
> @@ -993,12 +994,19 @@ static void handle_tx(struct vhost_net *net)
>  
>  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
>  {
> +	struct socket *sock = sk->sk_socket;
>  	struct sk_buff *head;
>  	int len = 0;
>  	unsigned long flags;
>  
> -	if (rvq->rx_ring)
> -		return vhost_net_buf_peek(rvq);
> +	if (rvq->rx_ring) {
> +		len = vhost_net_buf_peek(rvq);
> +		if (likely(len))
> +			return len;
> +	}
> +
> +	if (sock->ops->peek_len)
> +		return sock->ops->peek_len(sock);
>  
>  	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
>  	head = skb_peek(&sk->sk_receive_queue);
> -- 
> 2.33.0
>
Jason Wang Jan. 25, 2024, 3:44 a.m. UTC | #2
On Thu, Jan 25, 2024 at 3:05 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Yunjian Wang wrote:
> > Now the zero-copy feature of AF_XDP socket is supported by some
> > drivers, which can reduce CPU utilization on the xdp program.
> > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> >
> > This patch tries to address this by:
> > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > So add a check for empty vq's array in vhost_net_buf_produce().
> > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > - add tun_put_user_desc function to copy the Rx data to VM
> >
> > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
>
> I don't fully understand the higher level design of this feature yet.
>
> But some initial comments at the code level.
>
> > ---
> >  drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
> >  drivers/vhost/net.c |  18 +++--
> >  2 files changed, 176 insertions(+), 7 deletions(-)
> >

[...]

> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index f2ed7167c848..a1f143ad2341 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
>
> For virtio maintainer: is it okay to have tun and vhost/net changes in
> the same patch, or is it better to split them?

It's better to split, but as you comment below, if it must be done in
one patch we need to explain why.

>
> > @@ -169,9 +169,10 @@ static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
> >
> >  static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
> >  {
> > -     void *ret = vhost_net_buf_get_ptr(rxq);
> > -     ++rxq->head;
> > -     return ret;
> > +     if (rxq->tail == rxq->head)
> > +             return NULL;
> > +
> > +     return rxq->queue[rxq->head++];
>
> Why this change?

Thanks
Jason Wang Jan. 25, 2024, 4:48 a.m. UTC | #3
On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang <wangyunjian@huawei.com> wrote:
>
> Now the zero-copy feature of AF_XDP socket is supported by some
> drivers, which can reduce CPU utilization on the xdp program.
> This patch set allows tun to support AF_XDP Rx zero-copy feature.
>
> This patch tries to address this by:
> - Use peek_len to consume a xsk->desc and get xsk->desc length.
> - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> So add a check for empty vq's array in vhost_net_buf_produce().
> - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> - add tun_put_user_desc function to copy the Rx data to VM

The code explains itself; let's explain why you need to do this.

1) Why do you want to use peek_len?
2) For "vq's array", what does it mean?
3) From the view of TUN/TAP, tun_put_user_desc() is the TX path, so I
guess you meant TX zero-copy instead of RX (as I don't see code for
RX?)

A big question is how you would handle GSO packets from userspace/guests?

>
> Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> ---
>  drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/net.c |  18 +++--
>  2 files changed, 176 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index afa5497f7c35..248b0f8e07d1 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -77,6 +77,7 @@
>  #include <net/ax25.h>
>  #include <net/rose.h>
>  #include <net/6lowpan.h>
> +#include <net/xdp_sock_drv.h>
>
>  #include <linux/uaccess.h>
>  #include <linux/proc_fs.h>
> @@ -145,6 +146,10 @@ struct tun_file {
>         struct tun_struct *detached;
>         struct ptr_ring tx_ring;
>         struct xdp_rxq_info xdp_rxq;
> +       struct xdp_desc desc;
> +       /* protects xsk pool */
> +       spinlock_t pool_lock;
> +       struct xsk_buff_pool *pool;
>  };
>
>  struct tun_page {
> @@ -208,6 +213,8 @@ struct tun_struct {
>         struct bpf_prog __rcu *xdp_prog;
>         struct tun_prog __rcu *steering_prog;
>         struct tun_prog __rcu *filter_prog;
> +       /* tracks AF_XDP ZC enabled queues */
> +       unsigned long *af_xdp_zc_qps;
>         struct ethtool_link_ksettings link_ksettings;
>         /* init args */
>         struct file *file;
> @@ -795,6 +802,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>
>         tfile->queue_index = tun->numqueues;
>         tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
> +       tfile->desc.len = 0;
> +       tfile->pool = NULL;
>
>         if (tfile->detached) {
>                 /* Re-attach detached tfile, updating XDP queue_index */
> @@ -989,6 +998,13 @@ static int tun_net_init(struct net_device *dev)
>                 return err;
>         }
>
> +       tun->af_xdp_zc_qps = bitmap_zalloc(MAX_TAP_QUEUES, GFP_KERNEL);
> +       if (!tun->af_xdp_zc_qps) {
> +               security_tun_dev_free_security(tun->security);
> +               free_percpu(dev->tstats);
> +               return -ENOMEM;
> +       }
> +
>         tun_flow_init(tun);
>
>         dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> @@ -1009,6 +1025,7 @@ static int tun_net_init(struct net_device *dev)
>                 tun_flow_uninit(tun);
>                 security_tun_dev_free_security(tun->security);
>                 free_percpu(dev->tstats);
> +               bitmap_free(tun->af_xdp_zc_qps);
>                 return err;
>         }
>         return 0;
> @@ -1222,11 +1239,77 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>         return 0;
>  }
>
> +static int tun_xsk_pool_enable(struct net_device *netdev,
> +                              struct xsk_buff_pool *pool,
> +                              u16 qid)
> +{
> +       struct tun_struct *tun = netdev_priv(netdev);
> +       struct tun_file *tfile;
> +       unsigned long flags;
> +
> +       rcu_read_lock();
> +       tfile = rtnl_dereference(tun->tfiles[qid]);
> +       if (!tfile) {
> +               rcu_read_unlock();
> +               return -ENODEV;
> +       }
> +
> +       spin_lock_irqsave(&tfile->pool_lock, flags);
> +       xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> +       tfile->pool = pool;
> +       spin_unlock_irqrestore(&tfile->pool_lock, flags);
> +
> +       rcu_read_unlock();
> +       set_bit(qid, tun->af_xdp_zc_qps);
> +
> +       return 0;
> +}
> +
> +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
> +{
> +       struct tun_struct *tun = netdev_priv(netdev);
> +       struct tun_file *tfile;
> +       unsigned long flags;
> +
> +       if (!test_bit(qid, tun->af_xdp_zc_qps))
> +               return 0;
> +
> +       clear_bit(qid, tun->af_xdp_zc_qps);
> +
> +       rcu_read_lock();
> +       tfile = rtnl_dereference(tun->tfiles[qid]);
> +       if (!tfile) {
> +               rcu_read_unlock();
> +               return 0;
> +       }
> +
> +       spin_lock_irqsave(&tfile->pool_lock, flags);
> +       if (tfile->desc.len) {
> +               xsk_tx_completed(tfile->pool, 1);
> +               tfile->desc.len = 0;
> +       }
> +       tfile->pool = NULL;
> +       spin_unlock_irqrestore(&tfile->pool_lock, flags);
> +
> +       rcu_read_unlock();
> +       return 0;
> +}
> +
> +int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
> +                      u16 qid)
> +{
> +       return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> +               tun_xsk_pool_disable(dev, qid);
> +}
> +
>  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  {
>         switch (xdp->command) {
>         case XDP_SETUP_PROG:
>                 return tun_xdp_set(dev, xdp->prog, xdp->extack);
> +       case XDP_SETUP_XSK_POOL:
> +               return tun_xsk_pool_setup(dev, xdp->xsk.pool,
> +                                          xdp->xsk.queue_id);
>         default:
>                 return -EINVAL;
>         }
> @@ -1331,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
>         return nxmit;
>  }
>
> +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> +{
> +       struct tun_struct *tun = netdev_priv(dev);
> +       struct tun_file *tfile;
> +
> +       rcu_read_lock();
> +       tfile = rcu_dereference(tun->tfiles[qid]);
> +       if (tfile)
> +               __tun_xdp_flush_tfile(tfile);
> +       rcu_read_unlock();
> +       return 0;
> +}
> +
>  static const struct net_device_ops tap_netdev_ops = {
>         .ndo_init               = tun_net_init,
>         .ndo_uninit             = tun_net_uninit,
> @@ -1347,6 +1443,7 @@ static const struct net_device_ops tap_netdev_ops = {
>         .ndo_get_stats64        = dev_get_tstats64,
>         .ndo_bpf                = tun_xdp,
>         .ndo_xdp_xmit           = tun_xdp_xmit,
> +       .ndo_xsk_wakeup         = tun_xsk_wakeup,
>         .ndo_change_carrier     = tun_net_change_carrier,
>  };
>
> @@ -1404,7 +1501,8 @@ static void tun_net_initialize(struct net_device *dev)
>                 /* Currently tun does not support XDP, only tap does. */
>                 dev->xdp_features = NETDEV_XDP_ACT_BASIC |
>                                     NETDEV_XDP_ACT_REDIRECT |
> -                                   NETDEV_XDP_ACT_NDO_XMIT;
> +                                   NETDEV_XDP_ACT_NDO_XMIT |
> +                                   NETDEV_XDP_ACT_XSK_ZEROCOPY;
>
>                 break;
>         }
> @@ -2213,6 +2311,37 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>         return ptr;
>  }
>
> +static ssize_t tun_put_user_desc(struct tun_struct *tun,
> +                                struct tun_file *tfile,
> +                                struct xdp_desc *desc,
> +                                struct iov_iter *iter)
> +{
> +       size_t size = desc->len;
> +       int vnet_hdr_sz = 0;
> +       size_t ret;
> +
> +       if (tun->flags & IFF_VNET_HDR) {
> +               struct virtio_net_hdr_mrg_rxbuf gso = { 0 };
> +
> +               vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
> +               if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
> +                       return -EINVAL;
> +               if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
> +                            sizeof(gso)))
> +                       return -EFAULT;
> +               iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> +       }
> +
> +       ret = copy_to_iter(xsk_buff_raw_get_data(tfile->pool, desc->addr),
> +                          size, iter) + vnet_hdr_sz;
> +
> +       preempt_disable();
> +       dev_sw_netstats_tx_add(tun->dev, 1, ret);
> +       preempt_enable();
> +
> +       return ret;
> +}
> +
>  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>                            struct iov_iter *to,
>                            int noblock, void *ptr)
> @@ -2226,6 +2355,22 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>         }
>
>         if (!ptr) {
> +               /* Read frames from xsk's desc */
> +               if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> +                       spin_lock(&tfile->pool_lock);
> +                       if (tfile->pool) {
> +                               ret = tun_put_user_desc(tun, tfile, &tfile->desc, to);
> +                               xsk_tx_completed(tfile->pool, 1);
> +                               if (xsk_uses_need_wakeup(tfile->pool))
> +                                       xsk_set_tx_need_wakeup(tfile->pool);
> +                               tfile->desc.len = 0;
> +                       } else {
> +                               ret = -EBADFD;
> +                       }
> +                       spin_unlock(&tfile->pool_lock);
> +                       return ret;
> +               }
> +
>                 /* Read frames from ring */
>                 ptr = tun_ring_recv(tfile, noblock, &err);
>                 if (!ptr)
> @@ -2311,6 +2456,7 @@ static void tun_free_netdev(struct net_device *dev)
>
>         BUG_ON(!(list_empty(&tun->disabled)));
>
> +       bitmap_free(tun->af_xdp_zc_qps);
>         free_percpu(dev->tstats);
>         tun_flow_uninit(tun);
>         security_tun_dev_free_security(tun->security);
> @@ -2666,7 +2812,19 @@ static int tun_peek_len(struct socket *sock)
>         if (!tun)
>                 return 0;
>
> -       ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> +       if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> +               spin_lock(&tfile->pool_lock);
> +               if (tfile->pool && xsk_tx_peek_desc(tfile->pool, &tfile->desc)) {

Does this mean that if userspace doesn't peek, we can't read anything?

We need to make sure the read() syscall works, not just vhost_net.

> +                       xsk_tx_release(tfile->pool);
> +                       ret = tfile->desc.len;
> +                       /* The length of desc must be greater than 0 */
> +                       if (!ret)
> +                               xsk_tx_completed(tfile->pool, 1);
> +               }
> +               spin_unlock(&tfile->pool_lock);
> +       } else {
> +               ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> +       }
>         tun_put(tun);
>
>         return ret;
> @@ -3469,8 +3627,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>
>         mutex_init(&tfile->napi_mutex);
>         RCU_INIT_POINTER(tfile->tun, NULL);
> +       spin_lock_init(&tfile->pool_lock);
>         tfile->flags = 0;
>         tfile->ifindex = 0;
> +       tfile->pool = NULL;
> +       tfile->desc.len = 0;
>
>         init_waitqueue_head(&tfile->socket.wq.wait);
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index f2ed7167c848..a1f143ad2341 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -169,9 +169,10 @@ static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
>
>  static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
>  {
> -       void *ret = vhost_net_buf_get_ptr(rxq);
> -       ++rxq->head;
> -       return ret;
> +       if (rxq->tail == rxq->head)
> +               return NULL;
> +
> +       return rxq->queue[rxq->head++];
>  }
>
>  static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
> @@ -993,12 +994,19 @@ static void handle_tx(struct vhost_net *net)
>
>  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
>  {
> +       struct socket *sock = sk->sk_socket;
>         struct sk_buff *head;
>         int len = 0;
>         unsigned long flags;
>
> -       if (rvq->rx_ring)
> -               return vhost_net_buf_peek(rvq);
> +       if (rvq->rx_ring) {
> +               len = vhost_net_buf_peek(rvq);
> +               if (likely(len))
> +                       return len;
> +       }
> +
> +       if (sock->ops->peek_len)
> +               return sock->ops->peek_len(sock);

What prevents you from reusing the ptr_ring here? Then you don't need
the above tricks.

Thanks


>
>         spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
>         head = skb_peek(&sk->sk_receive_queue);
> --
> 2.33.0
>
wangyunjian Jan. 25, 2024, 11:44 a.m. UTC | #4
> -----Original Message-----
> From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com]
> Sent: Thursday, January 25, 2024 3:05 AM
> To: wangyunjian <wangyunjian@huawei.com>; mst@redhat.com;
> willemdebruijn.kernel@gmail.com; jasowang@redhat.com; kuba@kernel.org;
> davem@davemloft.net; magnus.karlsson@intel.com
> Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> kvm@vger.kernel.org; virtualization@lists.linux.dev; xudingke
> <xudingke@huawei.com>; wangyunjian <wangyunjian@huawei.com>
> Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> 
> Yunjian Wang wrote:
> > Now the zero-copy feature of AF_XDP socket is supported by some
> > drivers, which can reduce CPU utilization on the xdp program.
> > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> >
> > This patch tries to address this by:
> > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > So add a check for empty vq's array in vhost_net_buf_produce().
> > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > - add tun_put_user_desc function to copy the Rx data to VM
> >
> > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> 
> I don't fully understand the higher level design of this feature yet.

This feature can reduce memory copies for the XDP program.

> 
> But some initial comments at the code level.
> 
> > ---
> >  drivers/net/tun.c   | 165
> +++++++++++++++++++++++++++++++++++++++++++-
> >  drivers/vhost/net.c |  18 +++--
> >  2 files changed, 176 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c index
> > afa5497f7c35..248b0f8e07d1 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -77,6 +77,7 @@
> >  #include <net/ax25.h>
> >  #include <net/rose.h>
> >  #include <net/6lowpan.h>
> > +#include <net/xdp_sock_drv.h>
> >
> >  #include <linux/uaccess.h>
> >  #include <linux/proc_fs.h>
> > @@ -145,6 +146,10 @@ struct tun_file {
> >  	struct tun_struct *detached;
> >  	struct ptr_ring tx_ring;
> >  	struct xdp_rxq_info xdp_rxq;
> > +	struct xdp_desc desc;
> > +	/* protects xsk pool */
> > +	spinlock_t pool_lock;
> > +	struct xsk_buff_pool *pool;
> >  };
> >
> >  struct tun_page {
> > @@ -208,6 +213,8 @@ struct tun_struct {
> >  	struct bpf_prog __rcu *xdp_prog;
> >  	struct tun_prog __rcu *steering_prog;
> >  	struct tun_prog __rcu *filter_prog;
> > +	/* tracks AF_XDP ZC enabled queues */
> > +	unsigned long *af_xdp_zc_qps;
> >  	struct ethtool_link_ksettings link_ksettings;
> >  	/* init args */
> >  	struct file *file;
> > @@ -795,6 +802,8 @@ static int tun_attach(struct tun_struct *tun,
> > struct file *file,
> >
> >  	tfile->queue_index = tun->numqueues;
> >  	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
> > +	tfile->desc.len = 0;
> > +	tfile->pool = NULL;
> >
> >  	if (tfile->detached) {
> >  		/* Re-attach detached tfile, updating XDP queue_index */ @@ -989,6
> > +998,13 @@ static int tun_net_init(struct net_device *dev)
> >  		return err;
> >  	}
> >
> > +	tun->af_xdp_zc_qps = bitmap_zalloc(MAX_TAP_QUEUES, GFP_KERNEL);
> > +	if (!tun->af_xdp_zc_qps) {
> > +		security_tun_dev_free_security(tun->security);
> > +		free_percpu(dev->tstats);
> > +		return -ENOMEM;
> > +	}
> > +
> >  	tun_flow_init(tun);
> >
> >  	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | @@ -1009,6
> > +1025,7 @@ static int tun_net_init(struct net_device *dev)
> >  		tun_flow_uninit(tun);
> >  		security_tun_dev_free_security(tun->security);
> >  		free_percpu(dev->tstats);
> > +		bitmap_free(tun->af_xdp_zc_qps);
> 
> Please release state in inverse order of acquire.

Will do.

> 
> >  		return err;
> >  	}
> >  	return 0;
> > @@ -1222,11 +1239,77 @@ static int tun_xdp_set(struct net_device *dev,
> struct bpf_prog *prog,
> >  	return 0;
> >  }
> >
> > +static int tun_xsk_pool_enable(struct net_device *netdev,
> > +			       struct xsk_buff_pool *pool,
> > +			       u16 qid)
> > +{
> > +	struct tun_struct *tun = netdev_priv(netdev);
> > +	struct tun_file *tfile;
> > +	unsigned long flags;
> > +
> > +	rcu_read_lock();
> > +	tfile = rtnl_dereference(tun->tfiles[qid]);
> > +	if (!tfile) {
> > +		rcu_read_unlock();
> > +		return -ENODEV;
> > +	}
> 
> No need for rcu_read_lock with rtnl_dereference.
> 
> Consider ASSERT_RTNL() if unsure whether this patch could be reached without
> the rtnl held.

Will do.

> 
> > +
> > +	spin_lock_irqsave(&tfile->pool_lock, flags);
> > +	xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> > +	tfile->pool = pool;
> > +	spin_unlock_irqrestore(&tfile->pool_lock, flags);
> > +
> > +	rcu_read_unlock();
> > +	set_bit(qid, tun->af_xdp_zc_qps);
> 
> What are the concurrency semantics: there's a spinlock to make the update to
> xdp_rxq and pool a critical section, but the bitmap is not part of this? Please
> also then document why the irqsave.

The bitmap indicates whether the XDP pool is enabled on the current queue,
which limits the impact on queues where no pool is enabled.
The XDP program may release the pool while packets are being received,
so the lock is only used to protect the pool.

> 
> > +
> > +	return 0;
> > +}
> > +
> > +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid) {
> > +	struct tun_struct *tun = netdev_priv(netdev);
> > +	struct tun_file *tfile;
> > +	unsigned long flags;
> > +
> > +	if (!test_bit(qid, tun->af_xdp_zc_qps))
> > +		return 0;
> > +
> > +	clear_bit(qid, tun->af_xdp_zc_qps);
> 
> Time of check to time of use race between test and clear? Or is there no race
> because anything that clears will hold the RTNL? If so, please add a comment.
> 
> > +
> > +	rcu_read_lock();
> > +	tfile = rtnl_dereference(tun->tfiles[qid]);
> > +	if (!tfile) {
> > +		rcu_read_unlock();
> > +		return 0;
> > +	}
> > +
> > +	spin_lock_irqsave(&tfile->pool_lock, flags);
> > +	if (tfile->desc.len) {
> > +		xsk_tx_completed(tfile->pool, 1);
> > +		tfile->desc.len = 0;
> > +	}
> > +	tfile->pool = NULL;
> > +	spin_unlock_irqrestore(&tfile->pool_lock, flags);
> > +
> > +	rcu_read_unlock();
> > +	return 0;
> > +}
> > +
> > +int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
> > +		       u16 qid)
> > +{
> > +	return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> > +		tun_xsk_pool_disable(dev, qid);
> > +}
> > +
> >  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)  {
> >  	switch (xdp->command) {
> >  	case XDP_SETUP_PROG:
> >  		return tun_xdp_set(dev, xdp->prog, xdp->extack);
> > +	case XDP_SETUP_XSK_POOL:
> > +		return tun_xsk_pool_setup(dev, xdp->xsk.pool,
> > +					   xdp->xsk.queue_id);
> >  	default:
> >  		return -EINVAL;
> >  	}
> > @@ -1331,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev,
> struct xdp_buff *xdp)
> >  	return nxmit;
> >  }
> >
> > +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> > +{
> > +	struct tun_struct *tun = netdev_priv(dev);
> > +	struct tun_file *tfile;
> > +
> > +	rcu_read_lock();
> > +	tfile = rcu_dereference(tun->tfiles[qid]);
> > +	if (tfile)
> > +		__tun_xdp_flush_tfile(tfile);
> > +	rcu_read_unlock();
> > +	return 0;
> > +}
> > +
> >  static const struct net_device_ops tap_netdev_ops = {
> >  	.ndo_init		= tun_net_init,
> >  	.ndo_uninit		= tun_net_uninit,
> > @@ -1347,6 +1443,7 @@ static const struct net_device_ops
> tap_netdev_ops = {
> >  	.ndo_get_stats64	= dev_get_tstats64,
> >  	.ndo_bpf		= tun_xdp,
> >  	.ndo_xdp_xmit		= tun_xdp_xmit,
> > +	.ndo_xsk_wakeup		= tun_xsk_wakeup,
> >  	.ndo_change_carrier	= tun_net_change_carrier,
> >  };
> >
> > @@ -1404,7 +1501,8 @@ static void tun_net_initialize(struct net_device
> *dev)
> >  		/* Currently tun does not support XDP, only tap does. */
> >  		dev->xdp_features = NETDEV_XDP_ACT_BASIC |
> >  				    NETDEV_XDP_ACT_REDIRECT |
> > -				    NETDEV_XDP_ACT_NDO_XMIT;
> > +				    NETDEV_XDP_ACT_NDO_XMIT |
> > +				    NETDEV_XDP_ACT_XSK_ZEROCOPY;
> >
> >  		break;
> >  	}
> > @@ -2213,6 +2311,37 @@ static void *tun_ring_recv(struct tun_file *tfile,
> int noblock, int *err)
> >  	return ptr;
> >  }
> >
> > +static ssize_t tun_put_user_desc(struct tun_struct *tun,
> > +				 struct tun_file *tfile,
> > +				 struct xdp_desc *desc,
> > +				 struct iov_iter *iter)
> > +{
> > +	size_t size = desc->len;
> > +	int vnet_hdr_sz = 0;
> > +	size_t ret;
> > +
> > +	if (tun->flags & IFF_VNET_HDR) {
> > +		struct virtio_net_hdr_mrg_rxbuf gso = { 0 };
> > +
> > +		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
> > +		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
> > +			return -EINVAL;
> > +		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
> > +			     sizeof(gso)))
> > +			return -EFAULT;
> > +		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> > +	}
> > +
> > +	ret = copy_to_iter(xsk_buff_raw_get_data(tfile->pool, desc->addr),
> > +			   size, iter) + vnet_hdr_sz;
> > +
> > +	preempt_disable();
> > +	dev_sw_netstats_tx_add(tun->dev, 1, ret);
> > +	preempt_enable();
> > +
> > +	return ret;
> > +}
> > +
> 
> This is almost a copy of tun_put_user_xdp. Can we refactor to avoid duplicating
> code (here and elsewhere this may apply).

Will do.

> 
> >  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
> >  			   struct iov_iter *to,
> >  			   int noblock, void *ptr)
> > @@ -2226,6 +2355,22 @@ static ssize_t tun_do_read(struct tun_struct
> *tun, struct tun_file *tfile,
> >  	}
> >
> >  	if (!ptr) {
> > +		/* Read frames from xsk's desc */
> > +		if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> > +			spin_lock(&tfile->pool_lock);
> > +			if (tfile->pool) {
> > +				ret = tun_put_user_desc(tun, tfile, &tfile->desc, to);
> > +				xsk_tx_completed(tfile->pool, 1);
> > +				if (xsk_uses_need_wakeup(tfile->pool))
> > +					xsk_set_tx_need_wakeup(tfile->pool);
> 
> For the xsk maintainers if they're reading along: this two line pattern is seen
> quite often. Deserves a helper in a header file?
> 
> > +				tfile->desc.len = 0;
> > +			} else {
> > +				ret = -EBADFD;
> > +			}
> > +			spin_unlock(&tfile->pool_lock);
> > +			return ret;
> > +		}
> > +
> >  		/* Read frames from ring */
> >  		ptr = tun_ring_recv(tfile, noblock, &err);
> >  		if (!ptr)
> > @@ -2311,6 +2456,7 @@ static void tun_free_netdev(struct net_device
> > *dev)
> >
> >  	BUG_ON(!(list_empty(&tun->disabled)));
> >
> > +	bitmap_free(tun->af_xdp_zc_qps);
> >  	free_percpu(dev->tstats);
> >  	tun_flow_uninit(tun);
> >  	security_tun_dev_free_security(tun->security);
> > @@ -2666,7 +2812,19 @@ static int tun_peek_len(struct socket *sock)
> >  	if (!tun)
> >  		return 0;
> >
> > -	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> > +	if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> > +		spin_lock(&tfile->pool_lock);
> > +		if (tfile->pool && xsk_tx_peek_desc(tfile->pool, &tfile->desc)) {
> > +			xsk_tx_release(tfile->pool);
> > +			ret = tfile->desc.len;
> > +			/* The length of desc must be greater than 0 */
> > +			if (!ret)
> > +				xsk_tx_completed(tfile->pool, 1);
> 
> Peek semantics usually don't result in releasing the buffer. Am I
> misunderstanding this operation?
> > +		}
> > +		spin_unlock(&tfile->pool_lock);
> > +	} else {
> > +		ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> > +	}
> >  	tun_put(tun);
> >
> >  	return ret;
> > @@ -3469,8 +3627,11 @@ static int tun_chr_open(struct inode *inode,
> > struct file * file)
> >
> >  	mutex_init(&tfile->napi_mutex);
> >  	RCU_INIT_POINTER(tfile->tun, NULL);
> > +	spin_lock_init(&tfile->pool_lock);
> >  	tfile->flags = 0;
> >  	tfile->ifindex = 0;
> > +	tfile->pool = NULL;
> > +	tfile->desc.len = 0;
> >
> >  	init_waitqueue_head(&tfile->socket.wq.wait);
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index
> > f2ed7167c848..a1f143ad2341 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> 
> For virtio maintainer: is it okay to have tun and vhost/net changes in the same
> patch, or is it better to split them?
> 
> > @@ -169,9 +169,10 @@ static int vhost_net_buf_is_empty(struct
> > vhost_net_buf *rxq)
> >
> >  static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)  {
> > -	void *ret = vhost_net_buf_get_ptr(rxq);
> > -	++rxq->head;
> > -	return ret;
> > +	if (rxq->tail == rxq->head)
> > +		return NULL;
> > +
> > +	return rxq->queue[rxq->head++];
> 
> Why this change?
> 
> >  }
> >
> >  static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) @@
> > -993,12 +994,19 @@ static void handle_tx(struct vhost_net *net)
> >
> >  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock
> > *sk)  {
> > +	struct socket *sock = sk->sk_socket;
> >  	struct sk_buff *head;
> >  	int len = 0;
> >  	unsigned long flags;
> >
> > -	if (rvq->rx_ring)
> > -		return vhost_net_buf_peek(rvq);
> > +	if (rvq->rx_ring) {
> > +		len = vhost_net_buf_peek(rvq);
> > +		if (likely(len))
> > +			return len;
> > +	}
> > +
> > +	if (sock->ops->peek_len)
> > +		return sock->ops->peek_len(sock);
> >
> >  	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
> >  	head = skb_peek(&sk->sk_receive_queue);
> > --
> > 2.33.0
> >
> 

Thanks
wangyunjian Jan. 25, 2024, 12:54 p.m. UTC | #5
> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Thursday, January 25, 2024 12:49 PM
> To: wangyunjian <wangyunjian@huawei.com>
> Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> davem@davemloft.net; magnus.karlsson@intel.com; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>
> Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> 
> On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang <wangyunjian@huawei.com>
> wrote:
> >
> > Now the zero-copy feature of AF_XDP socket is supported by some
> > drivers, which can reduce CPU utilization on the xdp program.
> > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> >
> > This patch tries to address this by:
> > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > So add a check for empty vq's array in vhost_net_buf_produce().
> > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > - add tun_put_user_desc function to copy the Rx data to VM
> 
> Code explains themselves, let's explain why you need to do this.
> 
> 1) why you want to use peek_len
> 2) for "vq's array", what does it mean?
> 3) from the view of TUN/TAP tun_put_user_desc() is the TX path, so I guess you
> meant TX zerocopy instead of RX (as I don't see codes for
> RX?)

OK, I agree and will use TX zero-copy instead of RX zero-copy. I meant
RX zero-copy from the point of view of vhost-net.

> 
> A big question is how could you handle GSO packets from userspace/guests?

For now, by disabling the VM's TSO and csum features, since XDP does not
support GSO packets. However, this can be added once XDP supports GSO
in the future.

> 
> >
> > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> > ---
> >  drivers/net/tun.c   | 165
> +++++++++++++++++++++++++++++++++++++++++++-
> >  drivers/vhost/net.c |  18 +++--
> >  2 files changed, 176 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c index
> > afa5497f7c35..248b0f8e07d1 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -77,6 +77,7 @@
> >  #include <net/ax25.h>
> >  #include <net/rose.h>
> >  #include <net/6lowpan.h>
> > +#include <net/xdp_sock_drv.h>
> >
> >  #include <linux/uaccess.h>
> >  #include <linux/proc_fs.h>
> > @@ -145,6 +146,10 @@ struct tun_file {
> >         struct tun_struct *detached;
> >         struct ptr_ring tx_ring;
> >         struct xdp_rxq_info xdp_rxq;
> > +       struct xdp_desc desc;
> > +       /* protects xsk pool */
> > +       spinlock_t pool_lock;
> > +       struct xsk_buff_pool *pool;
> >  };
> >
> >  struct tun_page {
> > @@ -208,6 +213,8 @@ struct tun_struct {
> >         struct bpf_prog __rcu *xdp_prog;
> >         struct tun_prog __rcu *steering_prog;
> >         struct tun_prog __rcu *filter_prog;
> > +       /* tracks AF_XDP ZC enabled queues */
> > +       unsigned long *af_xdp_zc_qps;
> >         struct ethtool_link_ksettings link_ksettings;
> >         /* init args */
> >         struct file *file;
> > @@ -795,6 +802,8 @@ static int tun_attach(struct tun_struct *tun,
> > struct file *file,
> >
> >         tfile->queue_index = tun->numqueues;
> >         tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
> > +       tfile->desc.len = 0;
> > +       tfile->pool = NULL;
> >
> >         if (tfile->detached) {
> >                 /* Re-attach detached tfile, updating XDP queue_index
> > */ @@ -989,6 +998,13 @@ static int tun_net_init(struct net_device *dev)
> >                 return err;
> >         }
> >
> > +       tun->af_xdp_zc_qps = bitmap_zalloc(MAX_TAP_QUEUES,
> GFP_KERNEL);
> > +       if (!tun->af_xdp_zc_qps) {
> > +               security_tun_dev_free_security(tun->security);
> > +               free_percpu(dev->tstats);
> > +               return -ENOMEM;
> > +       }
> > +
> >         tun_flow_init(tun);
> >
> >         dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | @@
> -1009,6
> > +1025,7 @@ static int tun_net_init(struct net_device *dev)
> >                 tun_flow_uninit(tun);
> >                 security_tun_dev_free_security(tun->security);
> >                 free_percpu(dev->tstats);
> > +               bitmap_free(tun->af_xdp_zc_qps);
> >                 return err;
> >         }
> >         return 0;
> > @@ -1222,11 +1239,77 @@ static int tun_xdp_set(struct net_device *dev,
> struct bpf_prog *prog,
> >         return 0;
> >  }
> >
> > +static int tun_xsk_pool_enable(struct net_device *netdev,
> > +                              struct xsk_buff_pool *pool,
> > +                              u16 qid) {
> > +       struct tun_struct *tun = netdev_priv(netdev);
> > +       struct tun_file *tfile;
> > +       unsigned long flags;
> > +
> > +       rcu_read_lock();
> > +       tfile = rtnl_dereference(tun->tfiles[qid]);
> > +       if (!tfile) {
> > +               rcu_read_unlock();
> > +               return -ENODEV;
> > +       }
> > +
> > +       spin_lock_irqsave(&tfile->pool_lock, flags);
> > +       xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> > +       tfile->pool = pool;
> > +       spin_unlock_irqrestore(&tfile->pool_lock, flags);
> > +
> > +       rcu_read_unlock();
> > +       set_bit(qid, tun->af_xdp_zc_qps);
> > +
> > +       return 0;
> > +}
> > +
> > +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid) {
> > +       struct tun_struct *tun = netdev_priv(netdev);
> > +       struct tun_file *tfile;
> > +       unsigned long flags;
> > +
> > +       if (!test_bit(qid, tun->af_xdp_zc_qps))
> > +               return 0;
> > +
> > +       clear_bit(qid, tun->af_xdp_zc_qps);
> > +
> > +       rcu_read_lock();
> > +       tfile = rtnl_dereference(tun->tfiles[qid]);
> > +       if (!tfile) {
> > +               rcu_read_unlock();
> > +               return 0;
> > +       }
> > +
> > +       spin_lock_irqsave(&tfile->pool_lock, flags);
> > +       if (tfile->desc.len) {
> > +               xsk_tx_completed(tfile->pool, 1);
> > +               tfile->desc.len = 0;
> > +       }
> > +       tfile->pool = NULL;
> > +       spin_unlock_irqrestore(&tfile->pool_lock, flags);
> > +
> > +       rcu_read_unlock();
> > +       return 0;
> > +}
> > +
> > +int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
> > +                      u16 qid)
> > +{
> > +       return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> > +               tun_xsk_pool_disable(dev, qid); }
> > +
> >  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)  {
> >         switch (xdp->command) {
> >         case XDP_SETUP_PROG:
> >                 return tun_xdp_set(dev, xdp->prog, xdp->extack);
> > +       case XDP_SETUP_XSK_POOL:
> > +               return tun_xsk_pool_setup(dev, xdp->xsk.pool,
> > +                                          xdp->xsk.queue_id);
> >         default:
> >                 return -EINVAL;
> >         }
> > @@ -1331,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev,
> struct xdp_buff *xdp)
> >         return nxmit;
> >  }
> >
> > +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> > +{
> > +       struct tun_struct *tun = netdev_priv(dev);
> > +       struct tun_file *tfile;
> > +
> > +       rcu_read_lock();
> > +       tfile = rcu_dereference(tun->tfiles[qid]);
> > +       if (tfile)
> > +               __tun_xdp_flush_tfile(tfile);
> > +       rcu_read_unlock();
> > +       return 0;
> > +}
> > +
> >  static const struct net_device_ops tap_netdev_ops = {
> >         .ndo_init               = tun_net_init,
> >         .ndo_uninit             = tun_net_uninit,
> > @@ -1347,6 +1443,7 @@ static const struct net_device_ops
> tap_netdev_ops = {
> >         .ndo_get_stats64        = dev_get_tstats64,
> >         .ndo_bpf                = tun_xdp,
> >         .ndo_xdp_xmit           = tun_xdp_xmit,
> > +       .ndo_xsk_wakeup         = tun_xsk_wakeup,
> >         .ndo_change_carrier     = tun_net_change_carrier,
> >  };
> >
> > @@ -1404,7 +1501,8 @@ static void tun_net_initialize(struct net_device
> *dev)
> >                 /* Currently tun does not support XDP, only tap does. */
> >                 dev->xdp_features = NETDEV_XDP_ACT_BASIC |
> >                                     NETDEV_XDP_ACT_REDIRECT |
> > -                                   NETDEV_XDP_ACT_NDO_XMIT;
> > +                                   NETDEV_XDP_ACT_NDO_XMIT |
> > +
> NETDEV_XDP_ACT_XSK_ZEROCOPY;
> >
> >                 break;
> >         }
> > @@ -2213,6 +2311,37 @@ static void *tun_ring_recv(struct tun_file *tfile,
> int noblock, int *err)
> >         return ptr;
> >  }
> >
> > +static ssize_t tun_put_user_desc(struct tun_struct *tun,
> > +                                struct tun_file *tfile,
> > +                                struct xdp_desc *desc,
> > +                                struct iov_iter *iter) {
> > +       size_t size = desc->len;
> > +       int vnet_hdr_sz = 0;
> > +       size_t ret;
> > +
> > +       if (tun->flags & IFF_VNET_HDR) {
> > +               struct virtio_net_hdr_mrg_rxbuf gso = { 0 };
> > +
> > +               vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
> > +               if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
> > +                       return -EINVAL;
> > +               if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
> > +                            sizeof(gso)))
> > +                       return -EFAULT;
> > +               iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> > +       }
> > +
> > +       ret = copy_to_iter(xsk_buff_raw_get_data(tfile->pool, desc->addr),
> > +                          size, iter) + vnet_hdr_sz;
> > +
> > +       preempt_disable();
> > +       dev_sw_netstats_tx_add(tun->dev, 1, ret);
> > +       preempt_enable();
> > +
> > +       return ret;
> > +}
> > +
> >  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
> >                            struct iov_iter *to,
> >                            int noblock, void *ptr) @@ -2226,6
> +2355,22
> > @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
> >         }
> >
> >         if (!ptr) {
> > +               /* Read frames from xsk's desc */
> > +               if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> > +                       spin_lock(&tfile->pool_lock);
> > +                       if (tfile->pool) {
> > +                               ret = tun_put_user_desc(tun, tfile,
> &tfile->desc, to);
> > +                               xsk_tx_completed(tfile->pool, 1);
> > +                               if
> (xsk_uses_need_wakeup(tfile->pool))
> > +
> xsk_set_tx_need_wakeup(tfile->pool);
> > +                               tfile->desc.len = 0;
> > +                       } else {
> > +                               ret = -EBADFD;
> > +                       }
> > +                       spin_unlock(&tfile->pool_lock);
> > +                       return ret;
> > +               }
> > +
> >                 /* Read frames from ring */
> >                 ptr = tun_ring_recv(tfile, noblock, &err);
> >                 if (!ptr)
> > @@ -2311,6 +2456,7 @@ static void tun_free_netdev(struct net_device
> > *dev)
> >
> >         BUG_ON(!(list_empty(&tun->disabled)));
> >
> > +       bitmap_free(tun->af_xdp_zc_qps);
> >         free_percpu(dev->tstats);
> >         tun_flow_uninit(tun);
> >         security_tun_dev_free_security(tun->security);
> > @@ -2666,7 +2812,19 @@ static int tun_peek_len(struct socket *sock)
> >         if (!tun)
> >                 return 0;
> >
> > -       ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> > +       if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> > +               spin_lock(&tfile->pool_lock);
> > +               if (tfile->pool && xsk_tx_peek_desc(tfile->pool,
> > + &tfile->desc)) {
> 
> Does it mean if userspace doesn't peek, we can't read anything?

The peek could also be implemented in tun_xsk_wakeup(), in the context
where the XDP program runs.

> 
> We need to make sure syscall read works as well as vhost_net.
> 
> > +                       xsk_tx_release(tfile->pool);
> > +                       ret = tfile->desc.len;
> > +                       /* The length of desc must be greater than 0 */
> > +                       if (!ret)
> > +                               xsk_tx_completed(tfile->pool, 1);
> > +               }
> > +               spin_unlock(&tfile->pool_lock);
> > +       } else {
> > +               ret = PTR_RING_PEEK_CALL(&tfile->tx_ring,
> tun_ptr_peek_len);
> > +       }
> >         tun_put(tun);
> >
> >         return ret;
> > @@ -3469,8 +3627,11 @@ static int tun_chr_open(struct inode *inode,
> > struct file * file)
> >
> >         mutex_init(&tfile->napi_mutex);
> >         RCU_INIT_POINTER(tfile->tun, NULL);
> > +       spin_lock_init(&tfile->pool_lock);
> >         tfile->flags = 0;
> >         tfile->ifindex = 0;
> > +       tfile->pool = NULL;
> > +       tfile->desc.len = 0;
> >
> >         init_waitqueue_head(&tfile->socket.wq.wait);
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index
> > f2ed7167c848..a1f143ad2341 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -169,9 +169,10 @@ static int vhost_net_buf_is_empty(struct
> > vhost_net_buf *rxq)
> >
> >  static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)  {
> > -       void *ret = vhost_net_buf_get_ptr(rxq);
> > -       ++rxq->head;
> > -       return ret;
> > +       if (rxq->tail == rxq->head)
> > +               return NULL;
> > +
> > +       return rxq->queue[rxq->head++];
> >  }
> >
> >  static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) @@
> > -993,12 +994,19 @@ static void handle_tx(struct vhost_net *net)
> >
> >  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock
> > *sk)  {
> > +       struct socket *sock = sk->sk_socket;
> >         struct sk_buff *head;
> >         int len = 0;
> >         unsigned long flags;
> >
> > -       if (rvq->rx_ring)
> > -               return vhost_net_buf_peek(rvq);
> > +       if (rvq->rx_ring) {
> > +               len = vhost_net_buf_peek(rvq);
> > +               if (likely(len))
> > +                       return len;
> > +       }
> > +
> > +       if (sock->ops->peek_len)
> > +               return sock->ops->peek_len(sock);
> 
> What prevents you from reusing the ptr_ring here? Then you don't need the
> above tricks.

Thank you for your suggestion. I will consider how to reuse the ptr_ring.

> 
> Thanks
> 
> 
> >
> >         spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
> >         head = skb_peek(&sk->sk_receive_queue);
> > --
> > 2.33.0
> >
wangyunjian Jan. 27, 2024, 9:34 a.m. UTC | #6
> > -----Original Message-----
> > From: Jason Wang [mailto:jasowang@redhat.com]
> > Sent: Thursday, January 25, 2024 12:49 PM
> > To: wangyunjian <wangyunjian@huawei.com>
> > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> > davem@davemloft.net; magnus.karlsson@intel.com;
> > netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> > kvm@vger.kernel.org; virtualization@lists.linux.dev; xudingke
> > <xudingke@huawei.com>
> > Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> >
> > On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang
> <wangyunjian@huawei.com>
> > wrote:
> > >
> > > Now the zero-copy feature of AF_XDP socket is supported by some
> > > drivers, which can reduce CPU utilization on the xdp program.
> > > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> > >
> > > This patch tries to address this by:
> > > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > > So add a check for empty vq's array in vhost_net_buf_produce().
> > > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > > - add tun_put_user_desc function to copy the Rx data to VM
> >
> > Code explains themselves, let's explain why you need to do this.
> >
> > 1) why you want to use peek_len
> > 2) for "vq's array", what does it mean?
> > 3) from the view of TUN/TAP tun_put_user_desc() is the TX path, so I
> > guess you meant TX zerocopy instead of RX (as I don't see codes for
> > RX?)
> 
> OK, I agree and use TX zerocopy instead of RX zerocopy. I meant RX zerocopy
> from the view of vhost-net.
> 
> >
> > A big question is how could you handle GSO packets from userspace/guests?
> 
> Now by disabling VM's TSO and csum feature. XDP does not support GSO
> packets.
> However, this feature can be added once XDP supports it in the future.
> 
> >
> > >
> > > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> > > ---
> > >  drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
> > >  drivers/vhost/net.c |  18 +++--
> > >  2 files changed, 176 insertions(+), 7 deletions(-)

[...]

> > >
> > >  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> > >  {
> > > +       struct socket *sock = sk->sk_socket;
> > >         struct sk_buff *head;
> > >         int len = 0;
> > >         unsigned long flags;
> > >
> > > -       if (rvq->rx_ring)
> > > -               return vhost_net_buf_peek(rvq);
> > > +       if (rvq->rx_ring) {
> > > +               len = vhost_net_buf_peek(rvq);
> > > +               if (likely(len))
> > > +                       return len;
> > > +       }
> > > +
> > > +       if (sock->ops->peek_len)
> > > +               return sock->ops->peek_len(sock);
> >
> > What prevents you from reusing the ptr_ring here? Then you don't need
> > the above tricks.
> 
> Thank you for your suggestion. I will consider how to reuse the ptr_ring.

If ptr_ring is used to transfer xdp_descs, there is a problem: after some
xdp_descs are obtained through xsk_tx_peek_desc(), adding them to the
ptr_ring may fail (for example when the ring is full). However, no API is
available to roll back the descriptors that were already peeked.

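A minimal sketch of the problem, using the existing APIs
(tun_xdp_desc_to_ptr() is a hypothetical helper that encodes a desc for
the ring; a single consumer is assumed):

	struct xdp_desc desc;

	while (xsk_tx_peek_desc(pool, &desc)) {
		/* the desc is already consumed from the xsk tx queue here */
		if (ptr_ring_produce(&tfile->tx_ring,
				     tun_xdp_desc_to_ptr(&desc))) {
			/* ring full: there is no "un-peek" API, so the desc
			 * can only be dropped via xsk_tx_completed() or
			 * stashed somewhere until the ring has room
			 */
			break;
		}
	}
	xsk_tx_release(pool);
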
Thanks

> 
> >
> > Thanks
> >
> >
> > >
> > >         spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
> > >         head = skb_peek(&sk->sk_receive_queue);
> > > --
> > > 2.33.0
> > >
kernel test robot Jan. 27, 2024, 11:51 a.m. UTC | #7
Hi Yunjian,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Yunjian-Wang/xsk-Remove-non-zero-dma_page-check-in-xp_assign_dev/20240124-174011
base:   net-next/main
patch link:    https://lore.kernel.org/r/1706089075-16084-1-git-send-email-wangyunjian%40huawei.com
patch subject: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
config: um-allmodconfig (https://download.01.org/0day-ci/archive/20240127/202401271943.bs1GPeEU-lkp@intel.com/config)
compiler: clang version 18.0.0git (https://github.com/llvm/llvm-project a31a60074717fc40887cfe132b77eec93bedd307)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240127/202401271943.bs1GPeEU-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401271943.bs1GPeEU-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from drivers/net/tun.c:44:
   In file included from include/linux/skbuff.h:17:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     547 |         val = __raw_readb(PCI_IOBASE + addr);
         |                           ~~~~~~~~~~ ^
   include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     560 |         val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
         |                                                         ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
      37 | #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
         |                                                   ^
   In file included from drivers/net/tun.c:44:
   In file included from include/linux/skbuff.h:17:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     573 |         val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
         |                                                         ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
      35 | #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
         |                                                   ^
   In file included from drivers/net/tun.c:44:
   In file included from include/linux/skbuff.h:17:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     584 |         __raw_writeb(value, PCI_IOBASE + addr);
         |                             ~~~~~~~~~~ ^
   include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     594 |         __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
         |                                                       ~~~~~~~~~~ ^
   include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     604 |         __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
         |                                                       ~~~~~~~~~~ ^
   include/asm-generic/io.h:692:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     692 |         readsb(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:700:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     700 |         readsw(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:708:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     708 |         readsl(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:717:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     717 |         writesb(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   include/asm-generic/io.h:726:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     726 |         writesw(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   include/asm-generic/io.h:735:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     735 |         writesl(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
>> drivers/net/tun.c:1298:5: warning: no previous prototype for function 'tun_xsk_pool_setup' [-Wmissing-prototypes]
    1298 | int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
         |     ^
   drivers/net/tun.c:1298:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
    1298 | int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
         | ^
         | static 
   13 warnings generated.


vim +/tun_xsk_pool_setup +1298 drivers/net/tun.c

  1297	
> 1298	int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
  1299			       u16 qid)
  1300	{
  1301		return pool ? tun_xsk_pool_enable(dev, pool, qid) :
  1302			tun_xsk_pool_disable(dev, qid);
  1303	}
  1304
kernel test robot Jan. 28, 2024, 9:05 a.m. UTC | #8
Hi Yunjian,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Yunjian-Wang/xsk-Remove-non-zero-dma_page-check-in-xp_assign_dev/20240124-174011
base:   net-next/main
patch link:    https://lore.kernel.org/r/1706089075-16084-1-git-send-email-wangyunjian%40huawei.com
patch subject: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
config: s390-defconfig (https://download.01.org/0day-ci/archive/20240128/202401281639.yBaJZ4Sh-lkp@intel.com/config)
compiler: s390-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240128/202401281639.yBaJZ4Sh-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401281639.yBaJZ4Sh-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/net/tun.c:1298:5: warning: no previous prototype for 'tun_xsk_pool_setup' [-Wmissing-prototypes]
    1298 | int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
         |     ^~~~~~~~~~~~~~~~~~


vim +/tun_xsk_pool_setup +1298 drivers/net/tun.c

  1297	
> 1298	int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
  1299			       u16 qid)
  1300	{
  1301		return pool ? tun_xsk_pool_enable(dev, pool, qid) :
  1302			tun_xsk_pool_disable(dev, qid);
  1303	}
  1304
kernel test robot Jan. 28, 2024, 9:39 a.m. UTC | #9
Hi Yunjian,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Yunjian-Wang/xsk-Remove-non-zero-dma_page-check-in-xp_assign_dev/20240124-174011
base:   net-next/main
patch link:    https://lore.kernel.org/r/1706089075-16084-1-git-send-email-wangyunjian%40huawei.com
patch subject: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
config: i386-randconfig-062-20240127 (https://download.01.org/0day-ci/archive/20240128/202401281735.bX1dign9-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240128/202401281735.bX1dign9-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401281735.bX1dign9-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> drivers/net/tun.c:1298:5: sparse: sparse: symbol 'tun_xsk_pool_setup' was not declared. Should it be static?
   drivers/net/tun.c: note: in included file (through include/linux/mmzone.h, include/linux/gfp.h, include/linux/umh.h, include/linux/kmod.h, ...):
   include/linux/page-flags.h:242:46: sparse: sparse: self-comparison always evaluates to false

vim +/tun_xsk_pool_setup +1298 drivers/net/tun.c

  1297	
> 1298	int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
  1299			       u16 qid)
  1300	{
  1301		return pool ? tun_xsk_pool_enable(dev, pool, qid) :
  1302			tun_xsk_pool_disable(dev, qid);
  1303	}
  1304
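
All three robot reports flag the same thing: assuming tun_xsk_pool_setup()
is not meant to be referenced outside tun.c, the warning is resolved by
declaring the function static:

	static int tun_xsk_pool_setup(struct net_device *dev,
				      struct xsk_buff_pool *pool, u16 qid)
	{
		return pool ? tun_xsk_pool_enable(dev, pool, qid) :
			tun_xsk_pool_disable(dev, qid);
	}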
Jason Wang Jan. 29, 2024, 3:03 a.m. UTC | #10
On Thu, Jan 25, 2024 at 8:54 PM wangyunjian <wangyunjian@huawei.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Jason Wang [mailto:jasowang@redhat.com]
> > Sent: Thursday, January 25, 2024 12:49 PM
> > To: wangyunjian <wangyunjian@huawei.com>
> > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> > davem@davemloft.net; magnus.karlsson@intel.com; netdev@vger.kernel.org;
> > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>
> > Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> >
> > On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang <wangyunjian@huawei.com>
> > wrote:
> > >
> > > Now the zero-copy feature of AF_XDP socket is supported by some
> > > drivers, which can reduce CPU utilization on the xdp program.
> > > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> > >
> > > This patch tries to address this by:
> > > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > > So add a check for empty vq's array in vhost_net_buf_produce().
> > > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > > - add tun_put_user_desc function to copy the Rx data to VM
> >
> > Code explains themselves, let's explain why you need to do this.
> >
> > 1) why you want to use peek_len
> > 2) for "vq's array", what does it mean?
> > 3) from the view of TUN/TAP tun_put_user_desc() is the TX path, so I guess you
> > meant TX zerocopy instead of RX (as I don't see codes for
> > RX?)
>
> OK, I agree and use TX zerocopy instead of RX zerocopy. I meant RX zerocopy
> from the view of vhost-net.

Ok.

>
> >
> > A big question is how could you handle GSO packets from userspace/guests?
>
> Now by disabling VM's TSO and csum feature.

Btw, how could you do that?

Thanks
Jason Wang Jan. 29, 2024, 3:05 a.m. UTC | #11
On Sat, Jan 27, 2024 at 5:34 PM wangyunjian <wangyunjian@huawei.com> wrote:
>
> > > -----Original Message-----
> > > From: Jason Wang [mailto:jasowang@redhat.com]
> > > Sent: Thursday, January 25, 2024 12:49 PM
> > > To: wangyunjian <wangyunjian@huawei.com>
> > > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> > > davem@davemloft.net; magnus.karlsson@intel.com;
> > > netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> > > kvm@vger.kernel.org; virtualization@lists.linux.dev; xudingke
> > > <xudingke@huawei.com>
> > > Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> > >
> > > On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang <wangyunjian@huawei.com> wrote:
> > > >
> > > > Now the zero-copy feature of AF_XDP socket is supported by some
> > > > drivers, which can reduce CPU utilization on the xdp program.
> > > > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> > > >
> > > > This patch tries to address this by:
> > > > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > > > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > > > So add a check for empty vq's array in vhost_net_buf_produce().
> > > > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > > > - add tun_put_user_desc function to copy the Rx data to VM
> > >
> > > Code explains themselves, let's explain why you need to do this.
> > >
> > > 1) why you want to use peek_len
> > > 2) for "vq's array", what does it mean?
> > > 3) from the view of TUN/TAP tun_put_user_desc() is the TX path, so I
> > > guess you meant TX zerocopy instead of RX (as I don't see codes for
> > > RX?)
> >
> > OK, I agree and use TX zerocopy instead of RX zerocopy. I meant RX zerocopy
> > from the view of vhost-net.
> >
> > >
> > > A big question is how could you handle GSO packets from userspace/guests?
> >
> > Now by disabling VM's TSO and csum feature. XDP does not support GSO
> > packets.
> > However, this feature can be added once XDP supports it in the future.
> >
> > >
> > > >
> > > > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> > > > ---
> > > >  drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
> > > >  drivers/vhost/net.c |  18 +++--
> > > >  2 files changed, 176 insertions(+), 7 deletions(-)
>
> [...]
>
> > > >
> > > >  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> > > >  {
> > > > +       struct socket *sock = sk->sk_socket;
> > > >         struct sk_buff *head;
> > > >         int len = 0;
> > > >         unsigned long flags;
> > > >
> > > > -       if (rvq->rx_ring)
> > > > -               return vhost_net_buf_peek(rvq);
> > > > +       if (rvq->rx_ring) {
> > > > +               len = vhost_net_buf_peek(rvq);
> > > > +               if (likely(len))
> > > > +                       return len;
> > > > +       }
> > > > +
> > > > +       if (sock->ops->peek_len)
> > > > +               return sock->ops->peek_len(sock);
> > >
> > > What prevents you from reusing the ptr_ring here? Then you don't need
> > > the above tricks.
> >
> > Thank you for your suggestion. I will consider how to reuse the ptr_ring.
>
> If ptr_ring is used to transfer xdp_descs, there is a problem: After some
> xdp_descs are obtained through xsk_tx_peek_desc(), the descs may fail
> to be added to ptr_ring. However, no API is available to implement the
> rollback function.

I don't understand; this issue seems to exist for physical NICs as well.

We could get more descriptors than there are free slots in the NIC ring.

How do other NICs solve this issue?

Thanks

>
> Thanks
>
> >
> > >
> > > Thanks
> > >
> > >
> > > >
> > > >         spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
> > > >         head = skb_peek(&sk->sk_receive_queue);
> > > > --
> > > > 2.33.0
> > > >
>
wangyunjian Jan. 29, 2024, 11:10 a.m. UTC | #12
> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Monday, January 29, 2024 11:05 AM
> To: wangyunjian <wangyunjian@huawei.com>
> Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> davem@davemloft.net; magnus.karlsson@intel.com; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>
> Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> 
> On Sat, Jan 27, 2024 at 5:34 PM wangyunjian <wangyunjian@huawei.com>
> wrote:
> >
> > > > -----Original Message-----
> > > > From: Jason Wang [mailto:jasowang@redhat.com]
> > > > Sent: Thursday, January 25, 2024 12:49 PM
> > > > To: wangyunjian <wangyunjian@huawei.com>
> > > > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com;
> > > > kuba@kernel.org; davem@davemloft.net; magnus.karlsson@intel.com;
> > > > netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> > > > kvm@vger.kernel.org; virtualization@lists.linux.dev; xudingke
> > > > <xudingke@huawei.com>
> > > > Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> > > >
> > > > On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang <wangyunjian@huawei.com> wrote:
> > > > >
> > > > > Now the zero-copy feature of AF_XDP socket is supported by some
> > > > > drivers, which can reduce CPU utilization on the xdp program.
> > > > > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> > > > >
> > > > > This patch tries to address this by:
> > > > > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > > > > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > > > > So add a check for empty vq's array in vhost_net_buf_produce().
> > > > > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > > > > - add tun_put_user_desc function to copy the Rx data to VM
> > > >
> > > > Code explains themselves, let's explain why you need to do this.
> > > >
> > > > 1) why you want to use peek_len
> > > > 2) for "vq's array", what does it mean?
> > > > 3) from the view of TUN/TAP tun_put_user_desc() is the TX path, so
> > > > I guess you meant TX zerocopy instead of RX (as I don't see codes
> > > > for
> > > > RX?)
> > >
> > > OK, I agree and use TX zerocopy instead of RX zerocopy. I meant RX
> > > zerocopy from the view of vhost-net.
> > >
> > > >
> > > > A big question is how could you handle GSO packets from
> userspace/guests?
> > >
> > > Now by disabling VM's TSO and csum feature. XDP does not support GSO
> > > packets.
> > > However, this feature can be added once XDP supports it in the future.
> > >
> > > >
> > > > >
> > > > > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> > > > > ---
> > > > >  drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
> > > > >  drivers/vhost/net.c |  18 +++--
> > > > >  2 files changed, 176 insertions(+), 7 deletions(-)
> >
> > [...]
> >
> > > > >
> > > > >  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> > > > >  {
> > > > > +       struct socket *sock = sk->sk_socket;
> > > > >         struct sk_buff *head;
> > > > >         int len = 0;
> > > > >         unsigned long flags;
> > > > >
> > > > > -       if (rvq->rx_ring)
> > > > > -               return vhost_net_buf_peek(rvq);
> > > > > +       if (rvq->rx_ring) {
> > > > > +               len = vhost_net_buf_peek(rvq);
> > > > > +               if (likely(len))
> > > > > +                       return len;
> > > > > +       }
> > > > > +
> > > > > +       if (sock->ops->peek_len)
> > > > > +               return sock->ops->peek_len(sock);
> > > >
> > > > What prevents you from reusing the ptr_ring here? Then you don't
> > > > need the above tricks.
> > >
> > > Thank you for your suggestion. I will consider how to reuse the ptr_ring.
> >
> > If ptr_ring is used to transfer xdp_descs, there is a problem: After
> > some xdp_descs are obtained through xsk_tx_peek_desc(), the descs may
> > fail to be added to ptr_ring. However, no API is available to
> > implement the rollback function.
> 
> I don't understand, this issue seems to exist in the physical NIC as well?
> 
> We get more descriptors than the free slots in the NIC ring.
> 
> How did other NIC solve this issue?

Currently, physical NICs such as i40e, ice, ixgbe, igc, and mlx5 first
obtain the number of available NIC descriptors and then retrieve the same
number of xsk descriptors for processing.
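
As a sketch of that pattern (loosely modeled on the i40e zero-copy path;
free_tx_slots_in_ring() and post_to_ring() stand in for the
driver-specific parts):

	u32 free_slots = free_tx_slots_in_ring(ring);
	u32 nb_pkts, i;

	nb_pkts = xsk_tx_peek_release_desc_batch(pool, free_slots);
	/* at most free_slots descs were peeked, so all of them fit into
	 * the ring and no rollback is ever needed
	 */
	for (i = 0; i < nb_pkts; i++)
		post_to_ring(ring, &pool->tx_descs[i]);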

Thanks

> 
> Thanks
> 
> >
> > Thanks
> >
> > >
> > > >
> > > > Thanks
> > > >
> > > >
> > > > >
> > > > >         spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
> > > > >         head = skb_peek(&sk->sk_receive_queue);
> > > > > --
> > > > > 2.33.0
> > > > >
> >
wangyunjian Jan. 29, 2024, 11:40 a.m. UTC | #13
> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Monday, January 29, 2024 11:03 AM
> To: wangyunjian <wangyunjian@huawei.com>
> Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> davem@davemloft.net; magnus.karlsson@intel.com; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>
> Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> 
> On Thu, Jan 25, 2024 at 8:54 PM wangyunjian <wangyunjian@huawei.com>
> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Jason Wang [mailto:jasowang@redhat.com]
> > > Sent: Thursday, January 25, 2024 12:49 PM
> > > To: wangyunjian <wangyunjian@huawei.com>
> > > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com;
> > > kuba@kernel.org; davem@davemloft.net; magnus.karlsson@intel.com;
> > > netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> > > kvm@vger.kernel.org; virtualization@lists.linux.dev; xudingke
> > > <xudingke@huawei.com>
> > > Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> > >
> > > On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang
> > > <wangyunjian@huawei.com>
> > > wrote:
> > > >
> > > > Now the zero-copy feature of AF_XDP socket is supported by some
> > > > drivers, which can reduce CPU utilization on the xdp program.
> > > > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> > > >
> > > > This patch tries to address this by:
> > > > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > > > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > > > So add a check for empty vq's array in vhost_net_buf_produce().
> > > > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > > > - add tun_put_user_desc function to copy the Rx data to VM
> > >
> > > Code explains themselves, let's explain why you need to do this.
> > >
> > > 1) why you want to use peek_len
> > > 2) for "vq's array", what does it mean?
> > > 3) from the view of TUN/TAP tun_put_user_desc() is the TX path, so I
> > > guess you meant TX zerocopy instead of RX (as I don't see codes for
> > > RX?)
> >
> > OK, I agree and use TX zerocopy instead of RX zerocopy. I meant RX
> > zerocopy from the view of vhost-net.
> 
> Ok.
> 
> >
> > >
> > > A big question is how could you handle GSO packets from
> userspace/guests?
> >
> > Now by disabling VM's TSO and csum feature.
> 
> Btw, how could you do that?

By setting network backend-specific options, e.g. in the libvirt
interface definition:
<driver name='vhost'>
  <host csum='off' gso='off' tso4='off' tso6='off' ecn='off' ufo='off' mrg_rxbuf='off'/>
  <guest csum='off' tso4='off' tso6='off' ecn='off' ufo='off'/>
</driver>

Thanks

> 
> Thanks
>
Jason Wang Jan. 30, 2024, 2:48 a.m. UTC | #14
On Mon, Jan 29, 2024 at 7:10 PM wangyunjian <wangyunjian@huawei.com> wrote:
>
> > -----Original Message-----
> > From: Jason Wang [mailto:jasowang@redhat.com]
> > Sent: Monday, January 29, 2024 11:05 AM
> > To: wangyunjian <wangyunjian@huawei.com>
> > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> > davem@davemloft.net; magnus.karlsson@intel.com; netdev@vger.kernel.org;
> > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>
> > Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> >
> > On Sat, Jan 27, 2024 at 5:34 PM wangyunjian <wangyunjian@huawei.com>
> > wrote:
> > >
> > > > > -----Original Message-----
> > > > > From: Jason Wang [mailto:jasowang@redhat.com]
> > > > > Sent: Thursday, January 25, 2024 12:49 PM
> > > > > To: wangyunjian <wangyunjian@huawei.com>
> > > > > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com;
> > > > > kuba@kernel.org; davem@davemloft.net; magnus.karlsson@intel.com;
> > > > > netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> > > > > kvm@vger.kernel.org; virtualization@lists.linux.dev; xudingke
> > > > > <xudingke@huawei.com>
> > > > > Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> > > > >
> > > > > On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang <wangyunjian@huawei.com> wrote:
> > > > > >
> > > > > > Now the zero-copy feature of AF_XDP socket is supported by some
> > > > > > drivers, which can reduce CPU utilization on the xdp program.
> > > > > > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> > > > > >
> > > > > > This patch tries to address this by:
> > > > > > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > > > > > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > > > > > So add a check for empty vq's array in vhost_net_buf_produce().
> > > > > > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > > > > > - add tun_put_user_desc function to copy the Rx data to VM
> > > > >
> > > > > Code explains themselves, let's explain why you need to do this.
> > > > >
> > > > > 1) why you want to use peek_len
> > > > > 2) for "vq's array", what does it mean?
> > > > > 3) from the view of TUN/TAP tun_put_user_desc() is the TX path, so
> > > > > I guess you meant TX zerocopy instead of RX (as I don't see codes
> > > > > for
> > > > > RX?)
> > > >
> > > > OK, I agree and use TX zerocopy instead of RX zerocopy. I meant RX
> > > > zerocopy from the view of vhost-net.
> > > >
> > > > >
> > > > > A big question is how could you handle GSO packets from
> > userspace/guests?
> > > >
> > > > Now by disabling VM's TSO and csum feature. XDP does not support GSO
> > > > packets.
> > > > However, this feature can be added once XDP supports it in the future.
> > > >
> > > > >
> > > > > >
> > > > > > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> > > > > > ---
> > > > > >  drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
> > > > > >  drivers/vhost/net.c |  18 +++--
> > > > > >  2 files changed, 176 insertions(+), 7 deletions(-)
> > >
> > > [...]
> > >
> > > > > >
> > > > > >  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> > > > > >  {
> > > > > > +       struct socket *sock = sk->sk_socket;
> > > > > >         struct sk_buff *head;
> > > > > >         int len = 0;
> > > > > >         unsigned long flags;
> > > > > >
> > > > > > -       if (rvq->rx_ring)
> > > > > > -               return vhost_net_buf_peek(rvq);
> > > > > > +       if (rvq->rx_ring) {
> > > > > > +               len = vhost_net_buf_peek(rvq);
> > > > > > +               if (likely(len))
> > > > > > +                       return len;
> > > > > > +       }
> > > > > > +
> > > > > > +       if (sock->ops->peek_len)
> > > > > > +               return sock->ops->peek_len(sock);
> > > > >
> > > > > What prevents you from reusing the ptr_ring here? Then you don't
> > > > > need the above tricks.
> > > >
> > > > Thank you for your suggestion. I will consider how to reuse the ptr_ring.
> > >
> > > If ptr_ring is used to transfer xdp_descs, there is a problem: After
> > > some xdp_descs are obtained through xsk_tx_peek_desc(), the descs may
> > > fail to be added to ptr_ring. However, no API is available to
> > > implement the rollback function.
> >
> > I don't understand, this issue seems to exist in the physical NIC as well?
> >
> > We get more descriptors than the free slots in the NIC ring.
> >
> > How did other NIC solve this issue?
>
> Currently, physical NICs such as i40e, ice, ixgbe, igc, and mlx5 obtains
> available NIC descriptors and then retrieve the same number of xsk
> descriptors for processing.

Any reason we can't do the same? A ptr_ring should be much simpler than
a NIC ring anyhow.
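
For example (sketch; assumes tun is the only producer of the ring, so the
room check cannot race with another produce, and tun_xdp_desc_to_ptr() is
again a hypothetical helper that encodes a desc for the ring):

	struct xdp_desc desc;

	while (!ptr_ring_full(&tfile->tx_ring) &&
	       xsk_tx_peek_desc(pool, &desc)) {
		/* cannot fail: we are the only producer and room was
		 * just checked
		 */
		ptr_ring_produce(&tfile->tx_ring,
				 tun_xdp_desc_to_ptr(&desc));
	}
	xsk_tx_release(pool);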

Thanks

>
> Thanks
>
> >
> > Thanks
> >
> > >
> > > Thanks
> > >
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > >
> > > > > >
> > > > > >         spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
> > > > > >         head = skb_peek(&sk->sk_receive_queue);
> > > > > > --
> > > > > > 2.33.0
> > > > > >
> > >
>
Jason Wang Jan. 30, 2024, 2:50 a.m. UTC | #15
On Mon, Jan 29, 2024 at 7:40 PM wangyunjian <wangyunjian@huawei.com> wrote:
>
> > -----Original Message-----
> > From: Jason Wang [mailto:jasowang@redhat.com]
> > Sent: Monday, January 29, 2024 11:03 AM
> > To: wangyunjian <wangyunjian@huawei.com>
> > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> > davem@davemloft.net; magnus.karlsson@intel.com; netdev@vger.kernel.org;
> > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>
> > Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> >
> > On Thu, Jan 25, 2024 at 8:54 PM wangyunjian <wangyunjian@huawei.com>
> > wrote:
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Jason Wang [mailto:jasowang@redhat.com]
> > > > Sent: Thursday, January 25, 2024 12:49 PM
> > > > To: wangyunjian <wangyunjian@huawei.com>
> > > > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com;
> > > > kuba@kernel.org; davem@davemloft.net; magnus.karlsson@intel.com;
> > > > netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> > > > kvm@vger.kernel.org; virtualization@lists.linux.dev; xudingke
> > > > <xudingke@huawei.com>
> > > > Subject: Re: [PATCH net-next 2/2] tun: AF_XDP Rx zero-copy support
> > > >
> > > > On Wed, Jan 24, 2024 at 5:38 PM Yunjian Wang
> > > > <wangyunjian@huawei.com>
> > > > wrote:
> > > > >
> > > > > Now the zero-copy feature of AF_XDP socket is supported by some
> > > > > drivers, which can reduce CPU utilization on the xdp program.
> > > > > This patch set allows tun to support AF_XDP Rx zero-copy feature.
> > > > >
> > > > > This patch tries to address this by:
> > > > > - Use peek_len to consume a xsk->desc and get xsk->desc length.
> > > > > - When the tun support AF_XDP Rx zero-copy, the vq's array maybe empty.
> > > > > So add a check for empty vq's array in vhost_net_buf_produce().
> > > > > - add XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support
> > > > > - add tun_put_user_desc function to copy the Rx data to VM
> > > >
> > > > Code explains themselves, let's explain why you need to do this.
> > > >
> > > > 1) why you want to use peek_len
> > > > 2) for "vq's array", what does it mean?
> > > > 3) from the view of TUN/TAP tun_put_user_desc() is the TX path, so I
> > > > guess you meant TX zerocopy instead of RX (as I don't see codes for
> > > > RX?)
> > >
> > > OK, I agree and use TX zerocopy instead of RX zerocopy. I meant RX
> > > zerocopy from the view of vhost-net.
> >
> > Ok.
> >
> > >
> > > >
> > > > A big question is how could you handle GSO packets from
> > userspace/guests?
> > >
> > > Now by disabling VM's TSO and csum feature.
> >
> > Btw, how could you do that?
>
> By set network backend-specific options:
> <driver name='vhost'>
>         <host csum='off' gso='off' tso4='off' tso6='off' ecn='off' ufo='off' mrg_rxbuf='off'/>
>     <guest csum='off' tso4='off' tso6='off' ecn='off' ufo='off'/>
> </driver>

This is management-layer work, but the problem is what happens if GSO is
not disabled in the guest. Is there a way to:

1) force the guest GSO to be off, or
2) fall back gracefully? (a sketch of (2) follows)
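
For (2), one conceivable guard is to inspect the virtio-net header in
front of the frame and divert anything that still carries GSO metadata
back to the copy path (purely illustrative, not from the patch;
tun_fallback_copy() is hypothetical):

	struct virtio_net_hdr *hdr = frame;	/* hdr precedes the payload */

	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE)
		return tun_fallback_copy(tfile, frame);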

Thanks

>
> Thanks
>
> >
> > Thanks
> >
>

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index afa5497f7c35..248b0f8e07d1 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -77,6 +77,7 @@ 
 #include <net/ax25.h>
 #include <net/rose.h>
 #include <net/6lowpan.h>
+#include <net/xdp_sock_drv.h>
 
 #include <linux/uaccess.h>
 #include <linux/proc_fs.h>
@@ -145,6 +146,10 @@  struct tun_file {
 	struct tun_struct *detached;
 	struct ptr_ring tx_ring;
 	struct xdp_rxq_info xdp_rxq;
+	struct xdp_desc desc;
+	/* protects xsk pool */
+	spinlock_t pool_lock;
+	struct xsk_buff_pool *pool;
 };
 
 struct tun_page {
@@ -208,6 +213,8 @@  struct tun_struct {
 	struct bpf_prog __rcu *xdp_prog;
 	struct tun_prog __rcu *steering_prog;
 	struct tun_prog __rcu *filter_prog;
+	/* tracks AF_XDP ZC enabled queues */
+	unsigned long *af_xdp_zc_qps;
 	struct ethtool_link_ksettings link_ksettings;
 	/* init args */
 	struct file *file;
@@ -795,6 +802,8 @@  static int tun_attach(struct tun_struct *tun, struct file *file,
 
 	tfile->queue_index = tun->numqueues;
 	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
+	tfile->desc.len = 0;
+	tfile->pool = NULL;
 
 	if (tfile->detached) {
 		/* Re-attach detached tfile, updating XDP queue_index */
@@ -989,6 +998,13 @@  static int tun_net_init(struct net_device *dev)
 		return err;
 	}
 
+	tun->af_xdp_zc_qps = bitmap_zalloc(MAX_TAP_QUEUES, GFP_KERNEL);
+	if (!tun->af_xdp_zc_qps) {
+		security_tun_dev_free_security(tun->security);
+		free_percpu(dev->tstats);
+		return -ENOMEM;
+	}
+
 	tun_flow_init(tun);
 
 	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
@@ -1009,6 +1025,7 @@  static int tun_net_init(struct net_device *dev)
 		tun_flow_uninit(tun);
 		security_tun_dev_free_security(tun->security);
 		free_percpu(dev->tstats);
+		bitmap_free(tun->af_xdp_zc_qps);
 		return err;
 	}
 	return 0;
@@ -1222,11 +1239,77 @@  static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	return 0;
 }
 
+static int tun_xsk_pool_enable(struct net_device *netdev,
+			       struct xsk_buff_pool *pool,
+			       u16 qid)
+{
+	struct tun_struct *tun = netdev_priv(netdev);
+	struct tun_file *tfile;
+	unsigned long flags;
+
+	rcu_read_lock();
+	tfile = rtnl_dereference(tun->tfiles[qid]);
+	if (!tfile) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+
+	spin_lock_irqsave(&tfile->pool_lock, flags);
+	xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
+	tfile->pool = pool;
+	spin_unlock_irqrestore(&tfile->pool_lock, flags);
+
+	rcu_read_unlock();
+	set_bit(qid, tun->af_xdp_zc_qps);
+
+	return 0;
+}
+
+static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
+{
+	struct tun_struct *tun = netdev_priv(netdev);
+	struct tun_file *tfile;
+	unsigned long flags;
+
+	if (!test_bit(qid, tun->af_xdp_zc_qps))
+		return 0;
+
+	clear_bit(qid, tun->af_xdp_zc_qps);
+
+	rcu_read_lock();
+	tfile = rtnl_dereference(tun->tfiles[qid]);
+	if (!tfile) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	spin_lock_irqsave(&tfile->pool_lock, flags);
+	if (tfile->desc.len) {
+		xsk_tx_completed(tfile->pool, 1);
+		tfile->desc.len = 0;
+	}
+	tfile->pool = NULL;
+	spin_unlock_irqrestore(&tfile->pool_lock, flags);
+
+	rcu_read_unlock();
+	return 0;
+}
+
+int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
+		       u16 qid)
+{
+	return pool ? tun_xsk_pool_enable(dev, pool, qid) :
+		tun_xsk_pool_disable(dev, qid);
+}
+
 static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		return tun_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_SETUP_XSK_POOL:
+		return tun_xsk_pool_setup(dev, xdp->xsk.pool,
+					   xdp->xsk.queue_id);
 	default:
 		return -EINVAL;
 	}
@@ -1331,6 +1414,19 @@  static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	return nxmit;
 }
 
+static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
+
+	rcu_read_lock();
+	tfile = rcu_dereference(tun->tfiles[qid]);
+	if (tfile)
+		__tun_xdp_flush_tfile(tfile);
+	rcu_read_unlock();
+	return 0;
+}
+
 static const struct net_device_ops tap_netdev_ops = {
 	.ndo_init		= tun_net_init,
 	.ndo_uninit		= tun_net_uninit,
@@ -1347,6 +1443,7 @@  static const struct net_device_ops tap_netdev_ops = {
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_bpf		= tun_xdp,
 	.ndo_xdp_xmit		= tun_xdp_xmit,
+	.ndo_xsk_wakeup		= tun_xsk_wakeup,
 	.ndo_change_carrier	= tun_net_change_carrier,
 };
 
@@ -1404,7 +1501,8 @@  static void tun_net_initialize(struct net_device *dev)
 		/* Currently tun does not support XDP, only tap does. */
 		dev->xdp_features = NETDEV_XDP_ACT_BASIC |
 				    NETDEV_XDP_ACT_REDIRECT |
-				    NETDEV_XDP_ACT_NDO_XMIT;
+				    NETDEV_XDP_ACT_NDO_XMIT |
+				    NETDEV_XDP_ACT_XSK_ZEROCOPY;
 
 		break;
 	}
@@ -2213,6 +2311,37 @@  static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 	return ptr;
 }
 
+static ssize_t tun_put_user_desc(struct tun_struct *tun,
+				 struct tun_file *tfile,
+				 struct xdp_desc *desc,
+				 struct iov_iter *iter)
+{
+	size_t size = desc->len;
+	int vnet_hdr_sz = 0;
+	size_t ret;
+
+	if (tun->flags & IFF_VNET_HDR) {
+		struct virtio_net_hdr_mrg_rxbuf gso = { 0 };
+
+		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
+		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
+			return -EINVAL;
+		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
+			     sizeof(gso)))
+			return -EFAULT;
+		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
+	}
+
+	ret = copy_to_iter(xsk_buff_raw_get_data(tfile->pool, desc->addr),
+			   size, iter) + vnet_hdr_sz;
+
+	preempt_disable();
+	dev_sw_netstats_tx_add(tun->dev, 1, ret);
+	preempt_enable();
+
+	return ret;
+}
+
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			   struct iov_iter *to,
 			   int noblock, void *ptr)
@@ -2226,6 +2355,22 @@  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	}
 
 	if (!ptr) {
+		/* Read frames from xsk's desc */
+		if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
+			spin_lock(&tfile->pool_lock);
+			if (tfile->pool) {
+				ret = tun_put_user_desc(tun, tfile, &tfile->desc, to);
+				xsk_tx_completed(tfile->pool, 1);
+				if (xsk_uses_need_wakeup(tfile->pool))
+					xsk_set_tx_need_wakeup(tfile->pool);
+				tfile->desc.len = 0;
+			} else {
+				ret = -EBADFD;
+			}
+			spin_unlock(&tfile->pool_lock);
+			return ret;
+		}
+
 		/* Read frames from ring */
 		ptr = tun_ring_recv(tfile, noblock, &err);
 		if (!ptr)
@@ -2311,6 +2456,7 @@  static void tun_free_netdev(struct net_device *dev)
 
 	BUG_ON(!(list_empty(&tun->disabled)));
 
+	bitmap_free(tun->af_xdp_zc_qps);
 	free_percpu(dev->tstats);
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
@@ -2666,7 +2812,19 @@  static int tun_peek_len(struct socket *sock)
 	if (!tun)
 		return 0;
 
-	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
+	if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
+		spin_lock(&tfile->pool_lock);
+		if (tfile->pool && xsk_tx_peek_desc(tfile->pool, &tfile->desc)) {
+			xsk_tx_release(tfile->pool);
+			ret = tfile->desc.len;
+			/* The length of desc must be greater than 0 */
+			if (!ret)
+				xsk_tx_completed(tfile->pool, 1);
+		}
+		spin_unlock(&tfile->pool_lock);
+	} else {
+		ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
+	}
 	tun_put(tun);
 
 	return ret;
@@ -3469,8 +3627,11 @@  static int tun_chr_open(struct inode *inode, struct file * file)
 
 	mutex_init(&tfile->napi_mutex);
 	RCU_INIT_POINTER(tfile->tun, NULL);
+	spin_lock_init(&tfile->pool_lock);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
+	tfile->pool = NULL;
+	tfile->desc.len = 0;
 
 	init_waitqueue_head(&tfile->socket.wq.wait);
 
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f2ed7167c848..a1f143ad2341 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -169,9 +169,10 @@  static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
 
 static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
 {
-	void *ret = vhost_net_buf_get_ptr(rxq);
-	++rxq->head;
-	return ret;
+	if (rxq->tail == rxq->head)
+		return NULL;
+
+	return rxq->queue[rxq->head++];
 }
 
 static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
@@ -993,12 +994,19 @@  static void handle_tx(struct vhost_net *net)
 
 static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 {
+	struct socket *sock = sk->sk_socket;
 	struct sk_buff *head;
 	int len = 0;
 	unsigned long flags;
 
-	if (rvq->rx_ring)
-		return vhost_net_buf_peek(rvq);
+	if (rvq->rx_ring) {
+		len = vhost_net_buf_peek(rvq);
+		if (likely(len))
+			return len;
+	}
+
+	if (sock->ops->peek_len)
+		return sock->ops->peek_len(sock);
 
 	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	head = skb_peek(&sk->sk_receive_queue);