
[net-next,v2,3/3] tun: AF_XDP Tx zero-copy support

Message ID 1709118356-133960-1-git-send-email-wangyunjian@huawei.com (mailing list archive)
State New, archived
Series tun: AF_XDP Tx zero-copy support

Commit Message

wangyunjian Feb. 28, 2024, 11:05 a.m. UTC
This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
which can significantly reduce CPU utilization for XDP programs.

Since commit fc72d1d54dd9 ("tuntap: XDP transmission"), the pointer
ring has been utilized to queue different types of pointers by encoding
the type into the lower bits. Therefore, we introduce a new flag,
TUN_XDP_DESC_FLAG(0x2UL), which allows us to enqueue XDP descriptors
and differentiate them from XDP buffers and sk_buffs. Additionally, a
spin lock is added to protect the enable and disable operations on the xsk pool.
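
For illustration only, here is a minimal sketch of how such pointer tagging
could look (the names mirror the existing tun_xdp_to_ptr()/tun_ptr_to_xdp()
helpers; the actual definitions added to include/linux/if_tun.h by this
patch may differ):

	/* Illustrative sketch, not the patch's exact code. Bit 0x1UL already
	 * marks xdp_frame pointers, so 0x2UL is assumed free for xsk
	 * descriptors; struct xdp_desc is 8-byte aligned, leaving the low
	 * bits unused.
	 */
	#define TUN_XDP_DESC_FLAG	0x2UL

	static inline void *tun_xdp_desc_to_ptr(struct xdp_desc *desc)
	{
		return (void *)((unsigned long)desc | TUN_XDP_DESC_FLAG);
	}

	static inline bool tun_is_xdp_desc_frame(void *ptr)
	{
		return (unsigned long)ptr & TUN_XDP_DESC_FLAG;
	}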

Performance testing was performed on an Intel E5-2620 2.40GHz machine.
Traffic was generated and sent through TUN (testpmd txonly with AF_XDP)
to a VM (testpmd rxonly in the guest).

+------+---------+---------+---------+
|      |   copy  |zero-copy| speedup |
+------+---------+---------+---------+
| UDP  |   Mpps  |   Mpps  |    %    |
| 64   |   2.5   |   4.0   |   60%   |
| 512  |   2.1   |   3.6   |   71%   |
| 1024 |   1.9   |   3.3   |   73%   |
+------+---------+---------+---------+

Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
---
 drivers/net/tun.c      | 177 +++++++++++++++++++++++++++++++++++++++--
 drivers/vhost/net.c    |   4 +
 include/linux/if_tun.h |  32 ++++++++
 3 files changed, 208 insertions(+), 5 deletions(-)

Comments

kernel test robot Feb. 29, 2024, 10:24 a.m. UTC | #1
Hi Yunjian,

kernel test robot noticed the following build errors:

[auto build test ERROR on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Yunjian-Wang/xsk-Remove-non-zero-dma_page-check-in-xp_assign_dev/20240228-190840
base:   net-next/main
patch link:    https://lore.kernel.org/r/1709118356-133960-1-git-send-email-wangyunjian%40huawei.com
patch subject: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
config: i386-randconfig-012-20240229 (https://download.01.org/0day-ci/archive/20240229/202402291828.G9c5tW50-lkp@intel.com/config)
compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240229/202402291828.G9c5tW50-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202402291828.G9c5tW50-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/vhost/net.c: In function 'vhost_net_buf_peek_len':
>> drivers/vhost/net.c:206:27: error: initialization from incompatible pointer type [-Werror=incompatible-pointer-types]
      struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
                              ^~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +206 drivers/vhost/net.c

   198	
   199	static int vhost_net_buf_peek_len(void *ptr)
   200	{
   201		if (tun_is_xdp_frame(ptr)) {
   202			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
   203	
   204			return xdpf->len;
   205		} else if (tun_is_xdp_desc_frame(ptr)) {
 > 206			struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
   207	
   208			return desc->len;
   209		}
   210	
   211		return __skb_array_len_with_tag(ptr);
   212	}
   213
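
The error above suggests that tun_ptr_to_xdp_desc(), as defined in the patch,
returns struct xdp_frame * rather than struct xdp_desc *. A hedged sketch of
what the corrected helper in include/linux/if_tun.h would presumably look
like, assuming the tagging scheme described in the commit message:

	/* Sketch only: strip the TUN_XDP_DESC_FLAG tag and return the xsk
	 * descriptor with its proper type, so vhost_net_buf_peek_len() can
	 * read desc->len without the incompatible-pointer-type error.
	 */
	static inline struct xdp_desc *tun_ptr_to_xdp_desc(void *ptr)
	{
		return (struct xdp_desc *)((unsigned long)ptr & ~TUN_XDP_DESC_FLAG);
	}
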
Paolo Abeni Feb. 29, 2024, 11:12 a.m. UTC | #2
On Wed, 2024-02-28 at 19:05 +0800, Yunjian Wang wrote:
> @@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
>  	}
>  }
>  
> +static void tun_peek_xsk(struct tun_file *tfile)
> +{
> +	struct xsk_buff_pool *pool;
> +	u32 i, batch, budget;
> +	void *frame;
> +
> +	if (!ptr_ring_empty(&tfile->tx_ring))
> +		return;
> +
> +	spin_lock(&tfile->pool_lock);
> +	pool = tfile->xsk_pool;
> +	if (!pool) {
> +		spin_unlock(&tfile->pool_lock);
> +		return;
> +	}
> +
> +	if (tfile->nb_descs) {
> +		xsk_tx_completed(pool, tfile->nb_descs);
> +		if (xsk_uses_need_wakeup(pool))
> +			xsk_set_tx_need_wakeup(pool);
> +	}
> +
> +	spin_lock(&tfile->tx_ring.producer_lock);
> +	budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
> +
> +	batch = xsk_tx_peek_release_desc_batch(pool, budget);
> +	if (!batch) {

This branch looks like an unneeded "optimization". The generic loop
below should have the same effect with no measurable perf delta - and
smaller code. Just remove this.

> +		tfile->nb_descs = 0;
> +		spin_unlock(&tfile->tx_ring.producer_lock);
> +		spin_unlock(&tfile->pool_lock);
> +		return;
> +	}
> +
> +	tfile->nb_descs = batch;
> +	for (i = 0; i < batch; i++) {
> +		/* Encode the XDP DESC flag into lowest bit for consumer to differ
> +		 * XDP desc from XDP buffer and sk_buff.
> +		 */
> +		frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
> +		/* The budget must be less than or equal to tx_ring.size,
> +		 * so enqueuing will not fail.
> +		 */
> +		__ptr_ring_produce(&tfile->tx_ring, frame);
> +	}
> +	spin_unlock(&tfile->tx_ring.producer_lock);
> +	spin_unlock(&tfile->pool_lock);

More related to the general design: it looks wrong. What if
get_rx_bufs() will fail (ENOBUF) after successful peeking? With no more
incoming packets, later peek will return 0 and it looks like that the
half-processed packets will stay in the ring forever???

I think the 'ring produce' part should be moved into tun_do_read().

Cheers,

Paolo
wangyunjian Feb. 29, 2024, 1:15 p.m. UTC | #3
> -----Original Message-----
> From: Paolo Abeni [mailto:pabeni@redhat.com]
> Sent: Thursday, February 29, 2024 7:13 PM
> To: wangyunjian <wangyunjian@huawei.com>; mst@redhat.com;
> willemdebruijn.kernel@gmail.com; jasowang@redhat.com; kuba@kernel.org;
> bjorn@kernel.org; magnus.karlsson@intel.com; maciej.fijalkowski@intel.com;
> jonathan.lemon@gmail.com; davem@davemloft.net
> Cc: bpf@vger.kernel.org; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>; liwei (DT)
> <liwei395@huawei.com>
> Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
> 
> On Wed, 2024-02-28 at 19:05 +0800, Yunjian Wang wrote:
> > @@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
> >  	}
> >  }
> >
> > +static void tun_peek_xsk(struct tun_file *tfile) {
> > +	struct xsk_buff_pool *pool;
> > +	u32 i, batch, budget;
> > +	void *frame;
> > +
> > +	if (!ptr_ring_empty(&tfile->tx_ring))
> > +		return;
> > +
> > +	spin_lock(&tfile->pool_lock);
> > +	pool = tfile->xsk_pool;
> > +	if (!pool) {
> > +		spin_unlock(&tfile->pool_lock);
> > +		return;
> > +	}
> > +
> > +	if (tfile->nb_descs) {
> > +		xsk_tx_completed(pool, tfile->nb_descs);
> > +		if (xsk_uses_need_wakeup(pool))
> > +			xsk_set_tx_need_wakeup(pool);
> > +	}
> > +
> > +	spin_lock(&tfile->tx_ring.producer_lock);
> > +	budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
> > +
> > +	batch = xsk_tx_peek_release_desc_batch(pool, budget);
> > +	if (!batch) {
> 
> This branch looks like an unneeded "optimization". The generic loop below
> should have the same effect with no measurable perf delta - and smaller code.
> Just remove this.

OK, I will update it, thanks.

> 
> > +		tfile->nb_descs = 0;
> > +		spin_unlock(&tfile->tx_ring.producer_lock);
> > +		spin_unlock(&tfile->pool_lock);
> > +		return;
> > +	}
> > +
> > +	tfile->nb_descs = batch;
> > +	for (i = 0; i < batch; i++) {
> > +		/* Encode the XDP DESC flag into lowest bit for consumer to differ
> > +		 * XDP desc from XDP buffer and sk_buff.
> > +		 */
> > +		frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
> > +		/* The budget must be less than or equal to tx_ring.size,
> > +		 * so enqueuing will not fail.
> > +		 */
> > +		__ptr_ring_produce(&tfile->tx_ring, frame);
> > +	}
> > +	spin_unlock(&tfile->tx_ring.producer_lock);
> > +	spin_unlock(&tfile->pool_lock);
> 
> More related to the general design: it looks wrong. What if
> get_rx_bufs() will fail (ENOBUF) after successful peeking? With no more
> incoming packets, later peek will return 0 and it looks like that the
> half-processed packets will stay in the ring forever???

The vhost_net_rx_peek_head_len() function obtains the packet length but
does not consume the entry; the packet stays in the ring, so a later peek
will see it again.
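
For readers less familiar with the ptr_ring API, a minimal sketch of the
peek-versus-consume distinction being relied on here (illustrative only, not
the vhost-net code itself; tun_ptr_peek_len() is the tun.c helper that maps a
ring entry to its length):

	#include <linux/ptr_ring.h>

	/* __ptr_ring_peek() returns the head entry but leaves it queued, so
	 * a later peek sees the same entry again; only a consume removes it.
	 */
	static int example_peek_head_len(struct ptr_ring *ring)
	{
		int len = 0;
		void *ptr;

		spin_lock(&ring->consumer_lock);
		ptr = __ptr_ring_peek(ring);	/* entry stays in the ring */
		if (ptr)
			len = tun_ptr_peek_len(ptr);	/* length only, nothing consumed */
		spin_unlock(&ring->consumer_lock);

		return len;
	}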

> 
> I think the 'ring produce' part should be moved into tun_do_read().

Thank you for your suggestion. I will consider that.

> 
> Cheers,
> 
> Paolo
kernel test robot Feb. 29, 2024, 3:44 p.m. UTC | #4
Hi Yunjian,

kernel test robot noticed the following build errors:

[auto build test ERROR on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Yunjian-Wang/xsk-Remove-non-zero-dma_page-check-in-xp_assign_dev/20240228-190840
base:   net-next/main
patch link:    https://lore.kernel.org/r/1709118356-133960-1-git-send-email-wangyunjian%40huawei.com
patch subject: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
config: microblaze-randconfig-r131-20240229 (https://download.01.org/0day-ci/archive/20240229/202402292345.a49gfJLj-lkp@intel.com/config)
compiler: microblaze-linux-gcc (GCC) 13.2.0
reproduce: (https://download.01.org/0day-ci/archive/20240229/202402292345.a49gfJLj-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202402292345.a49gfJLj-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/vhost/net.c: In function 'vhost_net_buf_peek_len':
>> drivers/vhost/net.c:206:41: error: initialization of 'struct xdp_desc *' from incompatible pointer type 'struct xdp_frame *' [-Werror=incompatible-pointer-types]
     206 |                 struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
         |                                         ^~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +206 drivers/vhost/net.c

   198	
   199	static int vhost_net_buf_peek_len(void *ptr)
   200	{
   201		if (tun_is_xdp_frame(ptr)) {
   202			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
   203	
   204			return xdpf->len;
   205		} else if (tun_is_xdp_desc_frame(ptr)) {
 > 206			struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
   207	
   208			return desc->len;
   209		}
   210	
   211		return __skb_array_len_with_tag(ptr);
   212	}
   213
wangyunjian March 1, 2024, 11:45 a.m. UTC | #5
> -----Original Message-----
> From: Paolo Abeni [mailto:pabeni@redhat.com]
> Sent: Thursday, February 29, 2024 7:13 PM
> To: wangyunjian <wangyunjian@huawei.com>; mst@redhat.com;
> willemdebruijn.kernel@gmail.com; jasowang@redhat.com; kuba@kernel.org;
> bjorn@kernel.org; magnus.karlsson@intel.com; maciej.fijalkowski@intel.com;
> jonathan.lemon@gmail.com; davem@davemloft.net
> Cc: bpf@vger.kernel.org; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>; liwei (DT)
> <liwei395@huawei.com>
> Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
> 
> On Wed, 2024-02-28 at 19:05 +0800, Yunjian Wang wrote:
> > @@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
> >  	}
> >  }
> >
> > +static void tun_peek_xsk(struct tun_file *tfile) {
> > +	struct xsk_buff_pool *pool;
> > +	u32 i, batch, budget;
> > +	void *frame;
> > +
> > +	if (!ptr_ring_empty(&tfile->tx_ring))
> > +		return;
> > +
> > +	spin_lock(&tfile->pool_lock);
> > +	pool = tfile->xsk_pool;
> > +	if (!pool) {
> > +		spin_unlock(&tfile->pool_lock);
> > +		return;
> > +	}
> > +
> > +	if (tfile->nb_descs) {
> > +		xsk_tx_completed(pool, tfile->nb_descs);
> > +		if (xsk_uses_need_wakeup(pool))
> > +			xsk_set_tx_need_wakeup(pool);
> > +	}
> > +
> > +	spin_lock(&tfile->tx_ring.producer_lock);
> > +	budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
> > +
> > +	batch = xsk_tx_peek_release_desc_batch(pool, budget);
> > +	if (!batch) {
> 
> This branch looks like an unneeded "optimization". The generic loop below
> should have the same effect with no measurable perf delta - and smaller code.
> Just remove this.
> 
> > +		tfile->nb_descs = 0;
> > +		spin_unlock(&tfile->tx_ring.producer_lock);
> > +		spin_unlock(&tfile->pool_lock);
> > +		return;
> > +	}
> > +
> > +	tfile->nb_descs = batch;
> > +	for (i = 0; i < batch; i++) {
> > +		/* Encode the XDP DESC flag into lowest bit for consumer to differ
> > +		 * XDP desc from XDP buffer and sk_buff.
> > +		 */
> > +		frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
> > +		/* The budget must be less than or equal to tx_ring.size,
> > +		 * so enqueuing will not fail.
> > +		 */
> > +		__ptr_ring_produce(&tfile->tx_ring, frame);
> > +	}
> > +	spin_unlock(&tfile->tx_ring.producer_lock);
> > +	spin_unlock(&tfile->pool_lock);
> 
> More related to the general design: it looks wrong. What if
> get_rx_bufs() will fail (ENOBUF) after successful peeking? With no more
> incoming packets, later peek will return 0 and it looks like that the
> half-processed packets will stay in the ring forever???
> 
> I think the 'ring produce' part should be moved into tun_do_read().

Currently, vhost-net obtains a batch of descriptors/sk_buffs from the
ptr_ring, enqueues the batch into the virtqueue's internal queue, and then
consumes the descriptors/sk_buffs from that queue in sequence. As a result,
TUN does not know whether the batched descriptors have been used up, and
thus does not know when to return them.

So I think it is reasonable that, when vhost-net finds the ptr_ring empty,
it calls peek_len to return the previous descriptors and fetch new xsk
descriptors.
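
A hedged sketch of the batching pattern described above (illustrative, not
the actual vhost-net code): once a batch has been pulled out of the ptr_ring
into vhost-net's own queue, the producer side in TUN can no longer see when
those entries are finally used.

	#include <linux/ptr_ring.h>

	#define EXAMPLE_BATCH	64

	/* Consumer side: drain up to EXAMPLE_BATCH entries from the ptr_ring
	 * into a private queue. From the producer's point of view the entries
	 * are gone as soon as this returns, even though they may sit unused
	 * in local_queue[] for a while before being processed in sequence.
	 */
	static int example_refill_local_queue(struct ptr_ring *ring,
					      void **local_queue)
	{
		return ptr_ring_consume_batched(ring, local_queue, EXAMPLE_BATCH);
	}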

Thanks
> 
> Cheers,
> 
> Paolo
Michael S. Tsirkin March 1, 2024, 11:53 a.m. UTC | #6
On Fri, Mar 01, 2024 at 11:45:52AM +0000, wangyunjian wrote:
> > -----Original Message-----
> > From: Paolo Abeni [mailto:pabeni@redhat.com]
> > Sent: Thursday, February 29, 2024 7:13 PM
> > To: wangyunjian <wangyunjian@huawei.com>; mst@redhat.com;
> > willemdebruijn.kernel@gmail.com; jasowang@redhat.com; kuba@kernel.org;
> > bjorn@kernel.org; magnus.karlsson@intel.com; maciej.fijalkowski@intel.com;
> > jonathan.lemon@gmail.com; davem@davemloft.net
> > Cc: bpf@vger.kernel.org; netdev@vger.kernel.org;
> > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>; liwei (DT)
> > <liwei395@huawei.com>
> > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
> > 
> > On Wed, 2024-02-28 at 19:05 +0800, Yunjian Wang wrote:
> > > @@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
> > >  	}
> > >  }
> > >
> > > +static void tun_peek_xsk(struct tun_file *tfile) {
> > > +	struct xsk_buff_pool *pool;
> > > +	u32 i, batch, budget;
> > > +	void *frame;
> > > +
> > > +	if (!ptr_ring_empty(&tfile->tx_ring))
> > > +		return;
> > > +
> > > +	spin_lock(&tfile->pool_lock);
> > > +	pool = tfile->xsk_pool;
> > > +	if (!pool) {
> > > +		spin_unlock(&tfile->pool_lock);
> > > +		return;
> > > +	}
> > > +
> > > +	if (tfile->nb_descs) {
> > > +		xsk_tx_completed(pool, tfile->nb_descs);
> > > +		if (xsk_uses_need_wakeup(pool))
> > > +			xsk_set_tx_need_wakeup(pool);
> > > +	}
> > > +
> > > +	spin_lock(&tfile->tx_ring.producer_lock);
> > > +	budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
> > > +
> > > +	batch = xsk_tx_peek_release_desc_batch(pool, budget);
> > > +	if (!batch) {
> > 
> > This branch looks like an unneeded "optimization". The generic loop below
> > should have the same effect with no measurable perf delta - and smaller code.
> > Just remove this.
> > 
> > > +		tfile->nb_descs = 0;
> > > +		spin_unlock(&tfile->tx_ring.producer_lock);
> > > +		spin_unlock(&tfile->pool_lock);
> > > +		return;
> > > +	}
> > > +
> > > +	tfile->nb_descs = batch;
> > > +	for (i = 0; i < batch; i++) {
> > > +		/* Encode the XDP DESC flag into lowest bit for consumer to differ
> > > +		 * XDP desc from XDP buffer and sk_buff.
> > > +		 */
> > > +		frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
> > > +		/* The budget must be less than or equal to tx_ring.size,
> > > +		 * so enqueuing will not fail.
> > > +		 */
> > > +		__ptr_ring_produce(&tfile->tx_ring, frame);
> > > +	}
> > > +	spin_unlock(&tfile->tx_ring.producer_lock);
> > > +	spin_unlock(&tfile->pool_lock);
> > 
> > More related to the general design: it looks wrong. What if
> > get_rx_bufs() will fail (ENOBUF) after successful peeking? With no more
> > incoming packets, later peek will return 0 and it looks like that the
> > half-processed packets will stay in the ring forever???
> > 
> > I think the 'ring produce' part should be moved into tun_do_read().
> 
> Currently, the vhost-net obtains a batch descriptors/sk_buffs from the
> ptr_ring and enqueue the batch descriptors/sk_buffs to the virtqueue'queue,
> and then consumes the descriptors/sk_buffs from the virtqueue'queue in
> sequence. As a result, TUN does not know whether the batch descriptors have
> been used up, and thus does not know when to return the batch descriptors.
> 
> So, I think it's reasonable that when vhost-net checks ptr_ring is empty,
> it calls peek_len to get new xsk's descs and return the descriptors.
> 
> Thanks

What you need to think about is that if you peek, another call
in parallel can get the same value at the same time.


> > 
> > Cheers,
> > 
> > Paolo
>
Maciej Fijalkowski March 1, 2024, 2:11 p.m. UTC | #7
On Wed, Feb 28, 2024 at 07:05:56PM +0800, Yunjian Wang wrote:
> This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> which can significantly reduce CPU utilization for XDP programs.

Why no Rx ZC support though? What will happen if I try rxdrop xdpsock
against tun with this patch? You clearly allow for that.

> 
> Since commit fc72d1d54dd9 ("tuntap: XDP transmission"), the pointer
> ring has been utilized to queue different types of pointers by encoding
> the type into the lower bits. Therefore, we introduce a new flag,
> TUN_XDP_DESC_FLAG(0x2UL), which allows us to enqueue XDP descriptors
> and differentiate them from XDP buffers and sk_buffs. Additionally, a
> spin lock is added for enabling and disabling operations on the xsk pool.
> 
> The performance testing was performed on a Intel E5-2620 2.40GHz machine.
> Traffic were generated/send through TUN(testpmd txonly with AF_XDP)
> to VM (testpmd rxonly in guest).
> 
> +------+---------+---------+---------+
> |      |   copy  |zero-copy| speedup |
> +------+---------+---------+---------+
> | UDP  |   Mpps  |   Mpps  |    %    |
> | 64   |   2.5   |   4.0   |   60%   |
> | 512  |   2.1   |   3.6   |   71%   |
> | 1024 |   1.9   |   3.3   |   73%   |
> +------+---------+---------+---------+
> 
> Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> ---
>  drivers/net/tun.c      | 177 +++++++++++++++++++++++++++++++++++++++--
>  drivers/vhost/net.c    |   4 +
>  include/linux/if_tun.h |  32 ++++++++
>  3 files changed, 208 insertions(+), 5 deletions(-)
>
Willem de Bruijn March 1, 2024, 6:40 p.m. UTC | #8
Maciej Fijalkowski wrote:
> On Wed, Feb 28, 2024 at 07:05:56PM +0800, Yunjian Wang wrote:
> > This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> > which can significantly reduce CPU utilization for XDP programs.
> 
> Why no Rx ZC support though? What will happen if I try rxdrop xdpsock
> against tun with this patch? You clearly allow for that.

This is AF_XDP receive zerocopy, right?

The naming is always confusing with tun, but even though from a tun
PoV this happens on ndo_start_xmit, it is the AF_XDP equivalent to
tun_put_user.

So the implementation is more like other device's Rx ZC.

I would have preferred that name, but I think Jason asked for this
and given tun's weird status, there is something to be said for either.
Jason Wang March 4, 2024, 6:55 a.m. UTC | #9
On Wed, Feb 28, 2024 at 7:06 PM Yunjian Wang <wangyunjian@huawei.com> wrote:
>
> This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> which can significantly reduce CPU utilization for XDP programs.
>
> Since commit fc72d1d54dd9 ("tuntap: XDP transmission"), the pointer
> ring has been utilized to queue different types of pointers by encoding
> the type into the lower bits. Therefore, we introduce a new flag,
> TUN_XDP_DESC_FLAG(0x2UL), which allows us to enqueue XDP descriptors
> and differentiate them from XDP buffers and sk_buffs. Additionally, a
> spin lock is added for enabling and disabling operations on the xsk pool.
>
> The performance testing was performed on a Intel E5-2620 2.40GHz machine.
> Traffic were generated/send through TUN(testpmd txonly with AF_XDP)
> to VM (testpmd rxonly in guest).
>
> +------+---------+---------+---------+
> |      |   copy  |zero-copy| speedup |
> +------+---------+---------+---------+
> | UDP  |   Mpps  |   Mpps  |    %    |
> | 64   |   2.5   |   4.0   |   60%   |
> | 512  |   2.1   |   3.6   |   71%   |
> | 1024 |   1.9   |   3.3   |   73%   |
> +------+---------+---------+---------+
>
> Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> ---
>  drivers/net/tun.c      | 177 +++++++++++++++++++++++++++++++++++++++--
>  drivers/vhost/net.c    |   4 +
>  include/linux/if_tun.h |  32 ++++++++
>  3 files changed, 208 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index bc80fc1d576e..7f4ff50b532c 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -63,6 +63,7 @@
>  #include <net/rtnetlink.h>
>  #include <net/sock.h>
>  #include <net/xdp.h>
> +#include <net/xdp_sock_drv.h>
>  #include <net/ip_tunnels.h>
>  #include <linux/seq_file.h>
>  #include <linux/uio.h>
> @@ -86,6 +87,7 @@ static void tun_default_link_ksettings(struct net_device *dev,
>                                        struct ethtool_link_ksettings *cmd);
>
>  #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> +#define TUN_XDP_BATCH 64
>
>  /* TUN device flags */
>
> @@ -146,6 +148,9 @@ struct tun_file {
>         struct tun_struct *detached;
>         struct ptr_ring tx_ring;
>         struct xdp_rxq_info xdp_rxq;
> +       struct xsk_buff_pool *xsk_pool;
> +       spinlock_t pool_lock;   /* Protects xsk pool enable/disable */
> +       u32 nb_descs;
>  };
>
>  struct tun_page {
> @@ -614,6 +619,8 @@ void tun_ptr_free(void *ptr)
>                 struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
>
>                 xdp_return_frame(xdpf);
> +       } else if (tun_is_xdp_desc_frame(ptr)) {
> +               return;
>         } else {
>                 __skb_array_destroy_skb(ptr);
>         }
> @@ -631,6 +638,37 @@ static void tun_queue_purge(struct tun_file *tfile)
>         skb_queue_purge(&tfile->sk.sk_error_queue);
>  }
>
> +static void tun_set_xsk_pool(struct tun_file *tfile, struct xsk_buff_pool *pool)
> +{
> +       if (!pool)
> +               return;
> +
> +       spin_lock(&tfile->pool_lock);
> +       xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> +       tfile->xsk_pool = pool;
> +       spin_unlock(&tfile->pool_lock);
> +}
> +
> +static void tun_clean_xsk_pool(struct tun_file *tfile)
> +{
> +       spin_lock(&tfile->pool_lock);
> +       if (tfile->xsk_pool) {
> +               void *ptr;
> +
> +               while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
> +                       tun_ptr_free(ptr);
> +
> +               if (tfile->nb_descs) {
> +                       xsk_tx_completed(tfile->xsk_pool, tfile->nb_descs);
> +                       if (xsk_uses_need_wakeup(tfile->xsk_pool))
> +                               xsk_set_tx_need_wakeup(tfile->xsk_pool);
> +                       tfile->nb_descs = 0;
> +               }
> +               tfile->xsk_pool = NULL;
> +       }
> +       spin_unlock(&tfile->pool_lock);
> +}
> +
>  static void __tun_detach(struct tun_file *tfile, bool clean)
>  {
>         struct tun_file *ntfile;
> @@ -648,6 +686,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
>                 u16 index = tfile->queue_index;
>                 BUG_ON(index >= tun->numqueues);
>
> +               ntfile = rtnl_dereference(tun->tfiles[tun->numqueues - 1]);
> +               /* Stop xsk zc xmit */
> +               tun_clean_xsk_pool(tfile);
> +               tun_clean_xsk_pool(ntfile);
> +
>                 rcu_assign_pointer(tun->tfiles[index],
>                                    tun->tfiles[tun->numqueues - 1]);
>                 ntfile = rtnl_dereference(tun->tfiles[index]);
> @@ -668,6 +711,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
>                 tun_flow_delete_by_queue(tun, tun->numqueues + 1);
>                 /* Drop read queue */
>                 tun_queue_purge(tfile);
> +               tun_set_xsk_pool(ntfile, xsk_get_pool_from_qid(tun->dev, index));
>                 tun_set_real_num_queues(tun);
>         } else if (tfile->detached && clean) {
>                 tun = tun_enable_queue(tfile);
> @@ -801,6 +845,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>
>                 if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
>                         tfile->xdp_rxq.queue_index = tfile->queue_index;
> +               tun_set_xsk_pool(tfile, xsk_get_pool_from_qid(dev, tfile->queue_index));
>         } else {
>                 /* Setup XDP RX-queue info, for new tfile getting attached */
>                 err = xdp_rxq_info_reg(&tfile->xdp_rxq,
> @@ -1221,11 +1266,50 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>         return 0;
>  }
>
> +static int tun_xsk_pool_enable(struct net_device *netdev,
> +                              struct xsk_buff_pool *pool,
> +                              u16 qid)
> +{
> +       struct tun_struct *tun = netdev_priv(netdev);
> +       struct tun_file *tfile;
> +
> +       if (qid >= tun->numqueues)
> +               return -EINVAL;
> +
> +       tfile = rtnl_dereference(tun->tfiles[qid]);
> +       tun_set_xsk_pool(tfile, pool);
> +
> +       return 0;
> +}
> +
> +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
> +{
> +       struct tun_struct *tun = netdev_priv(netdev);
> +       struct tun_file *tfile;
> +
> +       if (qid >= MAX_TAP_QUEUES)
> +               return -EINVAL;
> +
> +       tfile = rtnl_dereference(tun->tfiles[qid]);
> +       if (tfile)
> +               tun_clean_xsk_pool(tfile);
> +       return 0;
> +}
> +
> +static int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
> +                             u16 qid)
> +{
> +       return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> +               tun_xsk_pool_disable(dev, qid);
> +}
> +
>  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  {
>         switch (xdp->command) {
>         case XDP_SETUP_PROG:
>                 return tun_xdp_set(dev, xdp->prog, xdp->extack);
> +       case XDP_SETUP_XSK_POOL:
> +               return tun_xsk_pool_setup(dev, xdp->xsk.pool, xdp->xsk.queue_id);
>         default:
>                 return -EINVAL;
>         }
> @@ -1330,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
>         return nxmit;
>  }
>
> +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> +{
> +       struct tun_struct *tun = netdev_priv(dev);
> +       struct tun_file *tfile;
> +
> +       rcu_read_lock();
> +       tfile = rcu_dereference(tun->tfiles[qid]);
> +       if (tfile)
> +               __tun_xdp_flush_tfile(tfile);
> +       rcu_read_unlock();
> +       return 0;
> +}

I may be missing something, but why not simply queue xdp frames into the
ptr ring? Then we don't need tricks for peek.

Thanks
wangyunjian March 4, 2024, 11:23 a.m. UTC | #10
> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Monday, March 4, 2024 2:56 PM
> To: wangyunjian <wangyunjian@huawei.com>
> Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> bjorn@kernel.org; magnus.karlsson@intel.com; maciej.fijalkowski@intel.com;
> jonathan.lemon@gmail.com; davem@davemloft.net; bpf@vger.kernel.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>; liwei (DT)
> <liwei395@huawei.com>
> Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
> 
> On Wed, Feb 28, 2024 at 7:06 PM Yunjian Wang <wangyunjian@huawei.com>
> wrote:
> >
> > This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> > which can significantly reduce CPU utilization for XDP programs.
> >
> > Since commit fc72d1d54dd9 ("tuntap: XDP transmission"), the pointer
> > ring has been utilized to queue different types of pointers by
> > encoding the type into the lower bits. Therefore, we introduce a new
> > flag, TUN_XDP_DESC_FLAG(0x2UL), which allows us to enqueue XDP
> > descriptors and differentiate them from XDP buffers and sk_buffs.
> > Additionally, a spin lock is added for enabling and disabling operations on the
> xsk pool.
> >
> > The performance testing was performed on a Intel E5-2620 2.40GHz
> machine.
> > Traffic were generated/send through TUN(testpmd txonly with AF_XDP) to
> > VM (testpmd rxonly in guest).
> >
> > +------+---------+---------+---------+
> > |      |   copy  |zero-copy| speedup |
> > +------+---------+---------+---------+
> > | UDP  |   Mpps  |   Mpps  |    %    |
> > | 64   |   2.5   |   4.0   |   60%   |
> > | 512  |   2.1   |   3.6   |   71%   |
> > | 1024 |   1.9   |   3.3   |   73%   |
> > +------+---------+---------+---------+
> >
> > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> > ---
> >  drivers/net/tun.c      | 177
> +++++++++++++++++++++++++++++++++++++++--
> >  drivers/vhost/net.c    |   4 +
> >  include/linux/if_tun.h |  32 ++++++++
> >  3 files changed, 208 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c index
> > bc80fc1d576e..7f4ff50b532c 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -63,6 +63,7 @@
> >  #include <net/rtnetlink.h>
> >  #include <net/sock.h>
> >  #include <net/xdp.h>
> > +#include <net/xdp_sock_drv.h>
> >  #include <net/ip_tunnels.h>
> >  #include <linux/seq_file.h>
> >  #include <linux/uio.h>
> > @@ -86,6 +87,7 @@ static void tun_default_link_ksettings(struct
> net_device *dev,
> >                                        struct
> ethtool_link_ksettings
> > *cmd);
> >
> >  #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> > +#define TUN_XDP_BATCH 64
> >
> >  /* TUN device flags */
> >
> > @@ -146,6 +148,9 @@ struct tun_file {
> >         struct tun_struct *detached;
> >         struct ptr_ring tx_ring;
> >         struct xdp_rxq_info xdp_rxq;
> > +       struct xsk_buff_pool *xsk_pool;
> > +       spinlock_t pool_lock;   /* Protects xsk pool enable/disable */
> > +       u32 nb_descs;
> >  };
> >
> >  struct tun_page {
> > @@ -614,6 +619,8 @@ void tun_ptr_free(void *ptr)
> >                 struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
> >
> >                 xdp_return_frame(xdpf);
> > +       } else if (tun_is_xdp_desc_frame(ptr)) {
> > +               return;
> >         } else {
> >                 __skb_array_destroy_skb(ptr);
> >         }
> > @@ -631,6 +638,37 @@ static void tun_queue_purge(struct tun_file *tfile)
> >         skb_queue_purge(&tfile->sk.sk_error_queue);
> >  }
> >
> > +static void tun_set_xsk_pool(struct tun_file *tfile, struct
> > +xsk_buff_pool *pool) {
> > +       if (!pool)
> > +               return;
> > +
> > +       spin_lock(&tfile->pool_lock);
> > +       xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> > +       tfile->xsk_pool = pool;
> > +       spin_unlock(&tfile->pool_lock); }
> > +
> > +static void tun_clean_xsk_pool(struct tun_file *tfile) {
> > +       spin_lock(&tfile->pool_lock);
> > +       if (tfile->xsk_pool) {
> > +               void *ptr;
> > +
> > +               while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
> > +                       tun_ptr_free(ptr);
> > +
> > +               if (tfile->nb_descs) {
> > +                       xsk_tx_completed(tfile->xsk_pool,
> tfile->nb_descs);
> > +                       if (xsk_uses_need_wakeup(tfile->xsk_pool))
> > +
> xsk_set_tx_need_wakeup(tfile->xsk_pool);
> > +                       tfile->nb_descs = 0;
> > +               }
> > +               tfile->xsk_pool = NULL;
> > +       }
> > +       spin_unlock(&tfile->pool_lock); }
> > +
> >  static void __tun_detach(struct tun_file *tfile, bool clean)  {
> >         struct tun_file *ntfile;
> > @@ -648,6 +686,11 @@ static void __tun_detach(struct tun_file *tfile, bool
> clean)
> >                 u16 index = tfile->queue_index;
> >                 BUG_ON(index >= tun->numqueues);
> >
> > +               ntfile = rtnl_dereference(tun->tfiles[tun->numqueues -
> 1]);
> > +               /* Stop xsk zc xmit */
> > +               tun_clean_xsk_pool(tfile);
> > +               tun_clean_xsk_pool(ntfile);
> > +
> >                 rcu_assign_pointer(tun->tfiles[index],
> >                                    tun->tfiles[tun->numqueues - 1]);
> >                 ntfile = rtnl_dereference(tun->tfiles[index]);
> > @@ -668,6 +711,7 @@ static void __tun_detach(struct tun_file *tfile, bool
> clean)
> >                 tun_flow_delete_by_queue(tun, tun->numqueues + 1);
> >                 /* Drop read queue */
> >                 tun_queue_purge(tfile);
> > +               tun_set_xsk_pool(ntfile,
> > + xsk_get_pool_from_qid(tun->dev, index));
> >                 tun_set_real_num_queues(tun);
> >         } else if (tfile->detached && clean) {
> >                 tun = tun_enable_queue(tfile); @@ -801,6 +845,7 @@
> > static int tun_attach(struct tun_struct *tun, struct file *file,
> >
> >                 if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
> >                         tfile->xdp_rxq.queue_index =
> > tfile->queue_index;
> > +               tun_set_xsk_pool(tfile, xsk_get_pool_from_qid(dev,
> > + tfile->queue_index));
> >         } else {
> >                 /* Setup XDP RX-queue info, for new tfile getting
> attached */
> >                 err = xdp_rxq_info_reg(&tfile->xdp_rxq, @@ -1221,11
> > +1266,50 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog
> *prog,
> >         return 0;
> >  }
> >
> > +static int tun_xsk_pool_enable(struct net_device *netdev,
> > +                              struct xsk_buff_pool *pool,
> > +                              u16 qid) {
> > +       struct tun_struct *tun = netdev_priv(netdev);
> > +       struct tun_file *tfile;
> > +
> > +       if (qid >= tun->numqueues)
> > +               return -EINVAL;
> > +
> > +       tfile = rtnl_dereference(tun->tfiles[qid]);
> > +       tun_set_xsk_pool(tfile, pool);
> > +
> > +       return 0;
> > +}
> > +
> > +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid) {
> > +       struct tun_struct *tun = netdev_priv(netdev);
> > +       struct tun_file *tfile;
> > +
> > +       if (qid >= MAX_TAP_QUEUES)
> > +               return -EINVAL;
> > +
> > +       tfile = rtnl_dereference(tun->tfiles[qid]);
> > +       if (tfile)
> > +               tun_clean_xsk_pool(tfile);
> > +       return 0;
> > +}
> > +
> > +static int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool
> *pool,
> > +                             u16 qid) {
> > +       return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> > +               tun_xsk_pool_disable(dev, qid); }
> > +
> >  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)  {
> >         switch (xdp->command) {
> >         case XDP_SETUP_PROG:
> >                 return tun_xdp_set(dev, xdp->prog, xdp->extack);
> > +       case XDP_SETUP_XSK_POOL:
> > +               return tun_xsk_pool_setup(dev, xdp->xsk.pool,
> > + xdp->xsk.queue_id);
> >         default:
> >                 return -EINVAL;
> >         }
> > @@ -1330,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev,
> struct xdp_buff *xdp)
> >         return nxmit;
> >  }
> >
> > +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> > +{
> > +       struct tun_struct *tun = netdev_priv(dev);
> > +       struct tun_file *tfile;
> > +
> > +       rcu_read_lock();
> > +       tfile = rcu_dereference(tun->tfiles[qid]);
> > +       if (tfile)
> > +               __tun_xdp_flush_tfile(tfile);
> > +       rcu_read_unlock();
> > +       return 0;
> > +}
> 
> I may miss something but why not simply queue xdp frames into ptr ring then
> we don't need tricks for peek?

The current implementation follows the other NICs' drivers (ixgbe, ice,
dpaa2, mlx5), which use the XDP descriptors directly. Converting XDP
descriptors to XDP frames does not seem to bring much benefit.
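
For context, a hedged sketch of the pattern those drivers use on their
zero-copy TX path (a generic illustration, not taken from any one of them):
a batch of descriptors is peeked from the pool, each one is translated into
a hardware TX descriptor, and the batch is completed once transmission
finishes.

	#include <net/xdp_sock_drv.h>

	static void example_xsk_zc_xmit(struct xsk_buff_pool *pool, u32 budget)
	{
		u32 i, nb_pkts;

		nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
		for (i = 0; i < nb_pkts; i++) {
			struct xdp_desc *desc = &pool->tx_descs[i];
			dma_addr_t dma = xsk_buff_raw_get_dma(pool, desc->addr);

			xsk_buff_raw_dma_sync_for_device(pool, dma, desc->len);
			/* fill a vendor-specific HW TX descriptor from dma and
			 * desc->len here
			 */
		}
		/* Real drivers call this later, from their TX completion path;
		 * it is inlined here only to keep the sketch short.
		 */
		if (nb_pkts)
			xsk_tx_completed(pool, nb_pkts);
	}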

Thanks
> 
> Thanks
wangyunjian March 4, 2024, 1:45 p.m. UTC | #11
> -----Original Message-----
> From: Michael S. Tsirkin [mailto:mst@redhat.com]
> Sent: Friday, March 1, 2024 7:53 PM
> To: wangyunjian <wangyunjian@huawei.com>
> Cc: Paolo Abeni <pabeni@redhat.com>; willemdebruijn.kernel@gmail.com;
> jasowang@redhat.com; kuba@kernel.org; bjorn@kernel.org;
> magnus.karlsson@intel.com; maciej.fijalkowski@intel.com;
> jonathan.lemon@gmail.com; davem@davemloft.net; bpf@vger.kernel.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>; liwei (DT)
> <liwei395@huawei.com>
> Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
> 
> On Fri, Mar 01, 2024 at 11:45:52AM +0000, wangyunjian wrote:
> > > -----Original Message-----
> > > From: Paolo Abeni [mailto:pabeni@redhat.com]
> > > Sent: Thursday, February 29, 2024 7:13 PM
> > > To: wangyunjian <wangyunjian@huawei.com>; mst@redhat.com;
> > > willemdebruijn.kernel@gmail.com; jasowang@redhat.com;
> > > kuba@kernel.org; bjorn@kernel.org; magnus.karlsson@intel.com;
> > > maciej.fijalkowski@intel.com; jonathan.lemon@gmail.com;
> > > davem@davemloft.net
> > > Cc: bpf@vger.kernel.org; netdev@vger.kernel.org;
> > > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>;
> > > liwei (DT) <liwei395@huawei.com>
> > > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy
> > > support
> > >
> > > On Wed, 2024-02-28 at 19:05 +0800, Yunjian Wang wrote:
> > > > @@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
> > > >  	}
> > > >  }
> > > >
> > > > +static void tun_peek_xsk(struct tun_file *tfile) {
> > > > +	struct xsk_buff_pool *pool;
> > > > +	u32 i, batch, budget;
> > > > +	void *frame;
> > > > +
> > > > +	if (!ptr_ring_empty(&tfile->tx_ring))
> > > > +		return;
> > > > +
> > > > +	spin_lock(&tfile->pool_lock);
> > > > +	pool = tfile->xsk_pool;
> > > > +	if (!pool) {
> > > > +		spin_unlock(&tfile->pool_lock);
> > > > +		return;
> > > > +	}
> > > > +
> > > > +	if (tfile->nb_descs) {
> > > > +		xsk_tx_completed(pool, tfile->nb_descs);
> > > > +		if (xsk_uses_need_wakeup(pool))
> > > > +			xsk_set_tx_need_wakeup(pool);
> > > > +	}
> > > > +
> > > > +	spin_lock(&tfile->tx_ring.producer_lock);
> > > > +	budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
> > > > +
> > > > +	batch = xsk_tx_peek_release_desc_batch(pool, budget);
> > > > +	if (!batch) {
> > >
> > > This branch looks like an unneeded "optimization". The generic loop
> > > below should have the same effect with no measurable perf delta - and
> smaller code.
> > > Just remove this.
> > >
> > > > +		tfile->nb_descs = 0;
> > > > +		spin_unlock(&tfile->tx_ring.producer_lock);
> > > > +		spin_unlock(&tfile->pool_lock);
> > > > +		return;
> > > > +	}
> > > > +
> > > > +	tfile->nb_descs = batch;
> > > > +	for (i = 0; i < batch; i++) {
> > > > +		/* Encode the XDP DESC flag into lowest bit for consumer to
> differ
> > > > +		 * XDP desc from XDP buffer and sk_buff.
> > > > +		 */
> > > > +		frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
> > > > +		/* The budget must be less than or equal to tx_ring.size,
> > > > +		 * so enqueuing will not fail.
> > > > +		 */
> > > > +		__ptr_ring_produce(&tfile->tx_ring, frame);
> > > > +	}
> > > > +	spin_unlock(&tfile->tx_ring.producer_lock);
> > > > +	spin_unlock(&tfile->pool_lock);
> > >
> > > More related to the general design: it looks wrong. What if
> > > get_rx_bufs() will fail (ENOBUF) after successful peeking? With no
> > > more incoming packets, later peek will return 0 and it looks like
> > > that the half-processed packets will stay in the ring forever???
> > >
> > > I think the 'ring produce' part should be moved into tun_do_read().
> >
> > Currently, the vhost-net obtains a batch descriptors/sk_buffs from the
> > ptr_ring and enqueue the batch descriptors/sk_buffs to the
> > virtqueue'queue, and then consumes the descriptors/sk_buffs from the
> > virtqueue'queue in sequence. As a result, TUN does not know whether
> > the batch descriptors have been used up, and thus does not know when to
> return the batch descriptors.
> >
> > So, I think it's reasonable that when vhost-net checks ptr_ring is
> > empty, it calls peek_len to get new xsk's descs and return the descriptors.
> >
> > Thanks
> 
> What you need to think about is that if you peek, another call in parallel can get
> the same value at the same time.

Thank you. I have identified a problem. The tx_descs array is allocated within the xsk
pool. When the xsk socket is freed, the pool and tx_descs are freed with it. However,
some descriptors may still remain in the virtqueue's queue, which could lead to a
use-after-free. Currently I do not have a solution for this concurrency problem, and I
believe this scenario may not be a good fit for reusing the ptr_ring.

Thanks

> 
> 
> > >
> > > Cheers,
> > >
> > > Paolo
> >
Jason Wang March 6, 2024, 2:11 a.m. UTC | #12
On Mon, Mar 4, 2024 at 7:24 PM wangyunjian <wangyunjian@huawei.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Jason Wang [mailto:jasowang@redhat.com]
> > Sent: Monday, March 4, 2024 2:56 PM
> > To: wangyunjian <wangyunjian@huawei.com>
> > Cc: mst@redhat.com; willemdebruijn.kernel@gmail.com; kuba@kernel.org;
> > bjorn@kernel.org; magnus.karlsson@intel.com; maciej.fijalkowski@intel.com;
> > jonathan.lemon@gmail.com; davem@davemloft.net; bpf@vger.kernel.org;
> > netdev@vger.kernel.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>; liwei (DT)
> > <liwei395@huawei.com>
> > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
> >
> > On Wed, Feb 28, 2024 at 7:06 PM Yunjian Wang <wangyunjian@huawei.com>
> > wrote:
> > >
> > > This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> > > which can significantly reduce CPU utilization for XDP programs.
> > >
> > > Since commit fc72d1d54dd9 ("tuntap: XDP transmission"), the pointer
> > > ring has been utilized to queue different types of pointers by
> > > encoding the type into the lower bits. Therefore, we introduce a new
> > > flag, TUN_XDP_DESC_FLAG(0x2UL), which allows us to enqueue XDP
> > > descriptors and differentiate them from XDP buffers and sk_buffs.
> > > Additionally, a spin lock is added for enabling and disabling operations on the
> > xsk pool.
> > >
> > > The performance testing was performed on a Intel E5-2620 2.40GHz
> > machine.
> > > Traffic were generated/send through TUN(testpmd txonly with AF_XDP) to
> > > VM (testpmd rxonly in guest).
> > >
> > > +------+---------+---------+---------+
> > > |      |   copy  |zero-copy| speedup |
> > > +------+---------+---------+---------+
> > > | UDP  |   Mpps  |   Mpps  |    %    |
> > > | 64   |   2.5   |   4.0   |   60%   |
> > > | 512  |   2.1   |   3.6   |   71%   |
> > > | 1024 |   1.9   |   3.3   |   73%   |
> > > +------+---------+---------+---------+
> > >
> > > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> > > ---
> > >  drivers/net/tun.c      | 177
> > +++++++++++++++++++++++++++++++++++++++--
> > >  drivers/vhost/net.c    |   4 +
> > >  include/linux/if_tun.h |  32 ++++++++
> > >  3 files changed, 208 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c index
> > > bc80fc1d576e..7f4ff50b532c 100644
> > > --- a/drivers/net/tun.c
> > > +++ b/drivers/net/tun.c
> > > @@ -63,6 +63,7 @@
> > >  #include <net/rtnetlink.h>
> > >  #include <net/sock.h>
> > >  #include <net/xdp.h>
> > > +#include <net/xdp_sock_drv.h>
> > >  #include <net/ip_tunnels.h>
> > >  #include <linux/seq_file.h>
> > >  #include <linux/uio.h>
> > > @@ -86,6 +87,7 @@ static void tun_default_link_ksettings(struct
> > net_device *dev,
> > >                                        struct
> > ethtool_link_ksettings
> > > *cmd);
> > >
> > >  #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> > > +#define TUN_XDP_BATCH 64
> > >
> > >  /* TUN device flags */
> > >
> > > @@ -146,6 +148,9 @@ struct tun_file {
> > >         struct tun_struct *detached;
> > >         struct ptr_ring tx_ring;
> > >         struct xdp_rxq_info xdp_rxq;
> > > +       struct xsk_buff_pool *xsk_pool;
> > > +       spinlock_t pool_lock;   /* Protects xsk pool enable/disable */
> > > +       u32 nb_descs;
> > >  };
> > >
> > >  struct tun_page {
> > > @@ -614,6 +619,8 @@ void tun_ptr_free(void *ptr)
> > >                 struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
> > >
> > >                 xdp_return_frame(xdpf);
> > > +       } else if (tun_is_xdp_desc_frame(ptr)) {
> > > +               return;
> > >         } else {
> > >                 __skb_array_destroy_skb(ptr);
> > >         }
> > > @@ -631,6 +638,37 @@ static void tun_queue_purge(struct tun_file *tfile)
> > >         skb_queue_purge(&tfile->sk.sk_error_queue);
> > >  }
> > >
> > > +static void tun_set_xsk_pool(struct tun_file *tfile, struct
> > > +xsk_buff_pool *pool) {
> > > +       if (!pool)
> > > +               return;
> > > +
> > > +       spin_lock(&tfile->pool_lock);
> > > +       xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> > > +       tfile->xsk_pool = pool;
> > > +       spin_unlock(&tfile->pool_lock); }
> > > +
> > > +static void tun_clean_xsk_pool(struct tun_file *tfile) {
> > > +       spin_lock(&tfile->pool_lock);
> > > +       if (tfile->xsk_pool) {
> > > +               void *ptr;
> > > +
> > > +               while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
> > > +                       tun_ptr_free(ptr);
> > > +
> > > +               if (tfile->nb_descs) {
> > > +                       xsk_tx_completed(tfile->xsk_pool,
> > tfile->nb_descs);
> > > +                       if (xsk_uses_need_wakeup(tfile->xsk_pool))
> > > +
> > xsk_set_tx_need_wakeup(tfile->xsk_pool);
> > > +                       tfile->nb_descs = 0;
> > > +               }
> > > +               tfile->xsk_pool = NULL;
> > > +       }
> > > +       spin_unlock(&tfile->pool_lock); }
> > > +
> > >  static void __tun_detach(struct tun_file *tfile, bool clean)  {
> > >         struct tun_file *ntfile;
> > > @@ -648,6 +686,11 @@ static void __tun_detach(struct tun_file *tfile, bool
> > clean)
> > >                 u16 index = tfile->queue_index;
> > >                 BUG_ON(index >= tun->numqueues);
> > >
> > > +               ntfile = rtnl_dereference(tun->tfiles[tun->numqueues -
> > 1]);
> > > +               /* Stop xsk zc xmit */
> > > +               tun_clean_xsk_pool(tfile);
> > > +               tun_clean_xsk_pool(ntfile);
> > > +
> > >                 rcu_assign_pointer(tun->tfiles[index],
> > >                                    tun->tfiles[tun->numqueues - 1]);
> > >                 ntfile = rtnl_dereference(tun->tfiles[index]);
> > > @@ -668,6 +711,7 @@ static void __tun_detach(struct tun_file *tfile, bool
> > clean)
> > >                 tun_flow_delete_by_queue(tun, tun->numqueues + 1);
> > >                 /* Drop read queue */
> > >                 tun_queue_purge(tfile);
> > > +               tun_set_xsk_pool(ntfile,
> > > + xsk_get_pool_from_qid(tun->dev, index));
> > >                 tun_set_real_num_queues(tun);
> > >         } else if (tfile->detached && clean) {
> > >                 tun = tun_enable_queue(tfile); @@ -801,6 +845,7 @@
> > > static int tun_attach(struct tun_struct *tun, struct file *file,
> > >
> > >                 if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
> > >                         tfile->xdp_rxq.queue_index =
> > > tfile->queue_index;
> > > +               tun_set_xsk_pool(tfile, xsk_get_pool_from_qid(dev,
> > > + tfile->queue_index));
> > >         } else {
> > >                 /* Setup XDP RX-queue info, for new tfile getting
> > attached */
> > >                 err = xdp_rxq_info_reg(&tfile->xdp_rxq, @@ -1221,11
> > > +1266,50 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog
> > *prog,
> > >         return 0;
> > >  }
> > >
> > > +static int tun_xsk_pool_enable(struct net_device *netdev,
> > > +                              struct xsk_buff_pool *pool,
> > > +                              u16 qid) {
> > > +       struct tun_struct *tun = netdev_priv(netdev);
> > > +       struct tun_file *tfile;
> > > +
> > > +       if (qid >= tun->numqueues)
> > > +               return -EINVAL;
> > > +
> > > +       tfile = rtnl_dereference(tun->tfiles[qid]);
> > > +       tun_set_xsk_pool(tfile, pool);
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid) {
> > > +       struct tun_struct *tun = netdev_priv(netdev);
> > > +       struct tun_file *tfile;
> > > +
> > > +       if (qid >= MAX_TAP_QUEUES)
> > > +               return -EINVAL;
> > > +
> > > +       tfile = rtnl_dereference(tun->tfiles[qid]);
> > > +       if (tfile)
> > > +               tun_clean_xsk_pool(tfile);
> > > +       return 0;
> > > +}
> > > +
> > > +static int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool
> > *pool,
> > > +                             u16 qid) {
> > > +       return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> > > +               tun_xsk_pool_disable(dev, qid); }
> > > +
> > >  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)  {
> > >         switch (xdp->command) {
> > >         case XDP_SETUP_PROG:
> > >                 return tun_xdp_set(dev, xdp->prog, xdp->extack);
> > > +       case XDP_SETUP_XSK_POOL:
> > > +               return tun_xsk_pool_setup(dev, xdp->xsk.pool,
> > > + xdp->xsk.queue_id);
> > >         default:
> > >                 return -EINVAL;
> > >         }
> > > @@ -1330,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev,
> > struct xdp_buff *xdp)
> > >         return nxmit;
> > >  }
> > >
> > > +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> > > +{
> > > +       struct tun_struct *tun = netdev_priv(dev);
> > > +       struct tun_file *tfile;
> > > +
> > > +       rcu_read_lock();
> > > +       tfile = rcu_dereference(tun->tfiles[qid]);
> > > +       if (tfile)
> > > +               __tun_xdp_flush_tfile(tfile);
> > > +       rcu_read_unlock();
> > > +       return 0;
> > > +}
> >
> > I may miss something but why not simply queue xdp frames into ptr ring then
> > we don't need tricks for peek?
>
> The current implementation is implemented with reference to other NIC's drivers
> (ixgbe, ice, dpaa2, mlx5), which use XDP descriptors directly.

Well, they all do the same thing, which is to translate XDP descriptors into
the vendor-specific descriptor format.

For TUN, the ptr_ring is the "vendor specific" format.

> Converting XDP
> descriptors to XDP frames does not seem to be very beneficial.

It would get rid of the trick for peek that is done in this patch?

Thanks

>
> Thanks
> >
> > Thanks
>
Jason Wang March 6, 2024, 5:32 a.m. UTC | #13
On Sat, Mar 2, 2024 at 2:40 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Maciej Fijalkowski wrote:
> > On Wed, Feb 28, 2024 at 07:05:56PM +0800, Yunjian Wang wrote:
> > > This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> > > which can significantly reduce CPU utilization for XDP programs.
> >
> > Why no Rx ZC support though? What will happen if I try rxdrop xdpsock
> > against tun with this patch? You clearly allow for that.
>
> This is AF_XDP receive zerocopy, right?
>
> The naming is always confusing with tun, but even though from a tun
> PoV this happens on ndo_start_xmit, it is the AF_XDP equivalent to
> tun_put_user.
>
> So the implementation is more like other device's Rx ZC.
>
> I would have preferred that name, but I think Jason asked for this
> and given tun's weird status, there is something bo said for either.
>

From the view of the AF_XDP userspace program, it's the TX path, and as
you said it happens on the TUN xmit path as well. When used with a VM,
it's the RX path.

So TX seems better.

Thanks
Jason Wang March 11, 2024, 4 a.m. UTC | #14
On Mon, Mar 4, 2024 at 9:45 PM wangyunjian <wangyunjian@huawei.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Michael S. Tsirkin [mailto:mst@redhat.com]
> > Sent: Friday, March 1, 2024 7:53 PM
> > To: wangyunjian <wangyunjian@huawei.com>
> > Cc: Paolo Abeni <pabeni@redhat.com>; willemdebruijn.kernel@gmail.com;
> > jasowang@redhat.com; kuba@kernel.org; bjorn@kernel.org;
> > magnus.karlsson@intel.com; maciej.fijalkowski@intel.com;
> > jonathan.lemon@gmail.com; davem@davemloft.net; bpf@vger.kernel.org;
> > netdev@vger.kernel.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>; liwei (DT)
> > <liwei395@huawei.com>
> > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
> >
> > On Fri, Mar 01, 2024 at 11:45:52AM +0000, wangyunjian wrote:
> > > > -----Original Message-----
> > > > From: Paolo Abeni [mailto:pabeni@redhat.com]
> > > > Sent: Thursday, February 29, 2024 7:13 PM
> > > > To: wangyunjian <wangyunjian@huawei.com>; mst@redhat.com;
> > > > willemdebruijn.kernel@gmail.com; jasowang@redhat.com;
> > > > kuba@kernel.org; bjorn@kernel.org; magnus.karlsson@intel.com;
> > > > maciej.fijalkowski@intel.com; jonathan.lemon@gmail.com;
> > > > davem@davemloft.net
> > > > Cc: bpf@vger.kernel.org; netdev@vger.kernel.org;
> > > > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > > > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>;
> > > > liwei (DT) <liwei395@huawei.com>
> > > > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy
> > > > support
> > > >
> > > > On Wed, 2024-02-28 at 19:05 +0800, Yunjian Wang wrote:
> > > > > @@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
> > > > >         }
> > > > >  }
> > > > >
> > > > > +static void tun_peek_xsk(struct tun_file *tfile) {
> > > > > +       struct xsk_buff_pool *pool;
> > > > > +       u32 i, batch, budget;
> > > > > +       void *frame;
> > > > > +
> > > > > +       if (!ptr_ring_empty(&tfile->tx_ring))
> > > > > +               return;
> > > > > +
> > > > > +       spin_lock(&tfile->pool_lock);
> > > > > +       pool = tfile->xsk_pool;
> > > > > +       if (!pool) {
> > > > > +               spin_unlock(&tfile->pool_lock);
> > > > > +               return;
> > > > > +       }
> > > > > +
> > > > > +       if (tfile->nb_descs) {
> > > > > +               xsk_tx_completed(pool, tfile->nb_descs);
> > > > > +               if (xsk_uses_need_wakeup(pool))
> > > > > +                       xsk_set_tx_need_wakeup(pool);
> > > > > +       }
> > > > > +
> > > > > +       spin_lock(&tfile->tx_ring.producer_lock);
> > > > > +       budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
> > > > > +
> > > > > +       batch = xsk_tx_peek_release_desc_batch(pool, budget);
> > > > > +       if (!batch) {
> > > >
> > > > This branch looks like an unneeded "optimization". The generic loop
> > > > below should have the same effect with no measurable perf delta - and
> > smaller code.
> > > > Just remove this.
> > > >
> > > > > +               tfile->nb_descs = 0;
> > > > > +               spin_unlock(&tfile->tx_ring.producer_lock);
> > > > > +               spin_unlock(&tfile->pool_lock);
> > > > > +               return;
> > > > > +       }
> > > > > +
> > > > > +       tfile->nb_descs = batch;
> > > > > +       for (i = 0; i < batch; i++) {
> > > > > +               /* Encode the XDP DESC flag into lowest bit for consumer to
> > differ
> > > > > +                * XDP desc from XDP buffer and sk_buff.
> > > > > +                */
> > > > > +               frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
> > > > > +               /* The budget must be less than or equal to tx_ring.size,
> > > > > +                * so enqueuing will not fail.
> > > > > +                */
> > > > > +               __ptr_ring_produce(&tfile->tx_ring, frame);
> > > > > +       }
> > > > > +       spin_unlock(&tfile->tx_ring.producer_lock);
> > > > > +       spin_unlock(&tfile->pool_lock);
> > > >
> > > > More related to the general design: it looks wrong. What if
> > > > get_rx_bufs() will fail (ENOBUF) after successful peeking? With no
> > > > more incoming packets, later peek will return 0 and it looks like
> > > > that the half-processed packets will stay in the ring forever???
> > > >
> > > > I think the 'ring produce' part should be moved into tun_do_read().
> > >
> > > Currently, the vhost-net obtains a batch descriptors/sk_buffs from the
> > > ptr_ring and enqueue the batch descriptors/sk_buffs to the
> > > virtqueue'queue, and then consumes the descriptors/sk_buffs from the
> > > virtqueue'queue in sequence. As a result, TUN does not know whether
> > > the batch descriptors have been used up, and thus does not know when to
> > return the batch descriptors.
> > >
> > > So, I think it's reasonable that when vhost-net checks ptr_ring is
> > > empty, it calls peek_len to get new xsk's descs and return the descriptors.
> > >
> > > Thanks
> >
> > What you need to think about is that if you peek, another call in parallel can get
> > the same value at the same time.
>
> Thank you. I have identified a problem. The tx_descs array was created within xsk's pool.
> When xsk is freed, the pool and tx_descs are also freed. However, some descs may
> remain in the virtqueue's queue, which could lead to a use-after-free scenario.

This can probably be solved by having tun signal vhost_net to drop those
descriptors when the xsk pool is disabled.
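
For illustration only, a rough sketch of what that drop could look like on the
vhost-net side; the helper below is hypothetical (not existing vhost-net code)
and only assumes the local batch keeps head/tail indices into a void * array,
as struct vhost_net_buf does today:

static void vhost_net_drop_xsk_descs(struct vhost_net_buf *rxq)
{
	int i;

	/* Walk the pointers already peeked into the local batch and discard
	 * the AF_XDP descriptors, so nothing keeps pointing into the pool's
	 * soon-to-be-freed tx_descs array.  Consumers of the batch would
	 * then have to skip the NULLed-out entries.
	 */
	for (i = rxq->head; i < rxq->tail; i++) {
		void *ptr = rxq->queue[i];

		if (ptr && tun_is_xdp_desc_frame(ptr))
			rxq->queue[i] = NULL;
	}
}

tun would still have to complete those descriptors back to the pool via
xsk_tx_completed(), which the disable path in the patch below already does
through tfile->nb_descs.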

Thanks

> Currently,
> I do not have an idea to solve this concurrency problem and believe this scenario may
> not be appropriate for reusing the ptr_ring.
>
> Thanks
>
> >
> >
> > > >
> > > > Cheers,
> > > >
> > > > Paolo
> > >
>
wangyunjian March 11, 2024, 1:27 p.m. UTC | #15
> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Monday, March 11, 2024 12:01 PM
> To: wangyunjian <wangyunjian@huawei.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>; Paolo Abeni <pabeni@redhat.com>;
> willemdebruijn.kernel@gmail.com; kuba@kernel.org; bjorn@kernel.org;
> magnus.karlsson@intel.com; maciej.fijalkowski@intel.com;
> jonathan.lemon@gmail.com; davem@davemloft.net; bpf@vger.kernel.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>; liwei (DT)
> <liwei395@huawei.com>
> Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
> 
> On Mon, Mar 4, 2024 at 9:45 PM wangyunjian <wangyunjian@huawei.com>
> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Michael S. Tsirkin [mailto:mst@redhat.com]
> > > Sent: Friday, March 1, 2024 7:53 PM
> > > To: wangyunjian <wangyunjian@huawei.com>
> > > Cc: Paolo Abeni <pabeni@redhat.com>;
> > > willemdebruijn.kernel@gmail.com; jasowang@redhat.com;
> > > kuba@kernel.org; bjorn@kernel.org; magnus.karlsson@intel.com;
> > > maciej.fijalkowski@intel.com; jonathan.lemon@gmail.com;
> > > davem@davemloft.net; bpf@vger.kernel.org; netdev@vger.kernel.org;
> > > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>;
> > > liwei (DT) <liwei395@huawei.com>
> > > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy
> > > support
> > >
> > > On Fri, Mar 01, 2024 at 11:45:52AM +0000, wangyunjian wrote:
> > > > > -----Original Message-----
> > > > > From: Paolo Abeni [mailto:pabeni@redhat.com]
> > > > > Sent: Thursday, February 29, 2024 7:13 PM
> > > > > To: wangyunjian <wangyunjian@huawei.com>; mst@redhat.com;
> > > > > willemdebruijn.kernel@gmail.com; jasowang@redhat.com;
> > > > > kuba@kernel.org; bjorn@kernel.org; magnus.karlsson@intel.com;
> > > > > maciej.fijalkowski@intel.com; jonathan.lemon@gmail.com;
> > > > > davem@davemloft.net
> > > > > Cc: bpf@vger.kernel.org; netdev@vger.kernel.org;
> > > > > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > > > > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>;
> > > > > liwei (DT) <liwei395@huawei.com>
> > > > > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy
> > > > > support
> > > > >
> > > > > On Wed, 2024-02-28 at 19:05 +0800, Yunjian Wang wrote:
> > > > > > @@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
> > > > > >         }
> > > > > >  }
> > > > > >
> > > > > > +static void tun_peek_xsk(struct tun_file *tfile) {
> > > > > > +       struct xsk_buff_pool *pool;
> > > > > > +       u32 i, batch, budget;
> > > > > > +       void *frame;
> > > > > > +
> > > > > > +       if (!ptr_ring_empty(&tfile->tx_ring))
> > > > > > +               return;
> > > > > > +
> > > > > > +       spin_lock(&tfile->pool_lock);
> > > > > > +       pool = tfile->xsk_pool;
> > > > > > +       if (!pool) {
> > > > > > +               spin_unlock(&tfile->pool_lock);
> > > > > > +               return;
> > > > > > +       }
> > > > > > +
> > > > > > +       if (tfile->nb_descs) {
> > > > > > +               xsk_tx_completed(pool, tfile->nb_descs);
> > > > > > +               if (xsk_uses_need_wakeup(pool))
> > > > > > +                       xsk_set_tx_need_wakeup(pool);
> > > > > > +       }
> > > > > > +
> > > > > > +       spin_lock(&tfile->tx_ring.producer_lock);
> > > > > > +       budget = min_t(u32, tfile->tx_ring.size,
> > > > > > + TUN_XDP_BATCH);
> > > > > > +
> > > > > > +       batch = xsk_tx_peek_release_desc_batch(pool, budget);
> > > > > > +       if (!batch) {
> > > > >
> > > > > This branch looks like an unneeded "optimization". The generic
> > > > > loop below should have the same effect with no measurable perf
> > > > > delta - and
> > > smaller code.
> > > > > Just remove this.
> > > > >
> > > > > > +               tfile->nb_descs = 0;
> > > > > > +               spin_unlock(&tfile->tx_ring.producer_lock);
> > > > > > +               spin_unlock(&tfile->pool_lock);
> > > > > > +               return;
> > > > > > +       }
> > > > > > +
> > > > > > +       tfile->nb_descs = batch;
> > > > > > +       for (i = 0; i < batch; i++) {
> > > > > > +               /* Encode the XDP DESC flag into lowest bit
> > > > > > + for consumer to
> > > differ
> > > > > > +                * XDP desc from XDP buffer and sk_buff.
> > > > > > +                */
> > > > > > +               frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
> > > > > > +               /* The budget must be less than or equal to
> tx_ring.size,
> > > > > > +                * so enqueuing will not fail.
> > > > > > +                */
> > > > > > +               __ptr_ring_produce(&tfile->tx_ring, frame);
> > > > > > +       }
> > > > > > +       spin_unlock(&tfile->tx_ring.producer_lock);
> > > > > > +       spin_unlock(&tfile->pool_lock);
> > > > >
> > > > > More related to the general design: it looks wrong. What if
> > > > > get_rx_bufs() will fail (ENOBUF) after successful peeking? With
> > > > > no more incoming packets, later peek will return 0 and it looks
> > > > > like that the half-processed packets will stay in the ring forever???
> > > > >
> > > > > I think the 'ring produce' part should be moved into tun_do_read().
> > > >
> > > > Currently, the vhost-net obtains a batch descriptors/sk_buffs from
> > > > the ptr_ring and enqueue the batch descriptors/sk_buffs to the
> > > > virtqueue'queue, and then consumes the descriptors/sk_buffs from
> > > > the virtqueue'queue in sequence. As a result, TUN does not know
> > > > whether the batch descriptors have been used up, and thus does not
> > > > know when to
> > > return the batch descriptors.
> > > >
> > > > So, I think it's reasonable that when vhost-net checks ptr_ring is
> > > > empty, it calls peek_len to get new xsk's descs and return the descriptors.
> > > >
> > > > Thanks
> > >
> > > What you need to think about is that if you peek, another call in
> > > parallel can get the same value at the same time.
> >
> > Thank you. I have identified a problem. The tx_descs array was created within
> xsk's pool.
> > When xsk is freed, the pool and tx_descs are also freed. However,
> > some descs may remain in the virtqueue's queue, which could lead to a
> use-after-free scenario.
> 
> This can probably be solved by having tun signal vhost_net to drop those
> descriptors when the xsk pool is disabled.

I think TUN can notify vhost_net to drop these descriptors through netdev events.
However, there is a potential concurrency problem: when handling netdev events
and packets, vhost_net would contend on the 'vq->mutex' lock, leading to unstable performance.
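
For illustration, roughly the shape such an event-path purge would take; the
notifier hook and vhost_net_drop_xsk_descs() are hypothetical here, only the
vq mutex is the one handle_rx() already takes, which is where the contention
above comes from:

static void vhost_net_purge_xsk(struct vhost_net *n)
{
	struct vhost_net_virtqueue *nvq = &n->vqs[VHOST_NET_VQ_RX];

	/* Serializes against handle_rx(); this is the lock the packet path
	 * holds while draining the ring, hence the contention concern.
	 */
	mutex_lock(&nvq->vq.mutex);
	vhost_net_drop_xsk_descs(&nvq->rxq);	/* hypothetical helper */
	mutex_unlock(&nvq->vq.mutex);
}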

Thanks
> 
> Thanks
> 
> > Currently,
> > I do not have an idea to solve this concurrency problem and believe
> > this scenario may not be appropriate for reusing the ptr_ring.
> >
> > Thanks
> >
> > >
> > >
> > > > >
> > > > > Cheers,
> > > > >
> > > > > Paolo
> > > >
> >
Jason Wang March 12, 2024, 6:07 a.m. UTC | #16
On Mon, Mar 11, 2024 at 9:28 PM wangyunjian <wangyunjian@huawei.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Jason Wang [mailto:jasowang@redhat.com]
> > Sent: Monday, March 11, 2024 12:01 PM
> > To: wangyunjian <wangyunjian@huawei.com>
> > Cc: Michael S. Tsirkin <mst@redhat.com>; Paolo Abeni <pabeni@redhat.com>;
> > willemdebruijn.kernel@gmail.com; kuba@kernel.org; bjorn@kernel.org;
> > magnus.karlsson@intel.com; maciej.fijalkowski@intel.com;
> > jonathan.lemon@gmail.com; davem@davemloft.net; bpf@vger.kernel.org;
> > netdev@vger.kernel.org; linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>; liwei (DT)
> > <liwei395@huawei.com>
> > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy support
> >
> > On Mon, Mar 4, 2024 at 9:45 PM wangyunjian <wangyunjian@huawei.com>
> > wrote:
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Michael S. Tsirkin [mailto:mst@redhat.com]
> > > > Sent: Friday, March 1, 2024 7:53 PM
> > > > To: wangyunjian <wangyunjian@huawei.com>
> > > > Cc: Paolo Abeni <pabeni@redhat.com>;
> > > > willemdebruijn.kernel@gmail.com; jasowang@redhat.com;
> > > > kuba@kernel.org; bjorn@kernel.org; magnus.karlsson@intel.com;
> > > > maciej.fijalkowski@intel.com; jonathan.lemon@gmail.com;
> > > > davem@davemloft.net; bpf@vger.kernel.org; netdev@vger.kernel.org;
> > > > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > > > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>;
> > > > liwei (DT) <liwei395@huawei.com>
> > > > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy
> > > > support
> > > >
> > > > On Fri, Mar 01, 2024 at 11:45:52AM +0000, wangyunjian wrote:
> > > > > > -----Original Message-----
> > > > > > From: Paolo Abeni [mailto:pabeni@redhat.com]
> > > > > > Sent: Thursday, February 29, 2024 7:13 PM
> > > > > > To: wangyunjian <wangyunjian@huawei.com>; mst@redhat.com;
> > > > > > willemdebruijn.kernel@gmail.com; jasowang@redhat.com;
> > > > > > kuba@kernel.org; bjorn@kernel.org; magnus.karlsson@intel.com;
> > > > > > maciej.fijalkowski@intel.com; jonathan.lemon@gmail.com;
> > > > > > davem@davemloft.net
> > > > > > Cc: bpf@vger.kernel.org; netdev@vger.kernel.org;
> > > > > > linux-kernel@vger.kernel.org; kvm@vger.kernel.org;
> > > > > > virtualization@lists.linux.dev; xudingke <xudingke@huawei.com>;
> > > > > > liwei (DT) <liwei395@huawei.com>
> > > > > > Subject: Re: [PATCH net-next v2 3/3] tun: AF_XDP Tx zero-copy
> > > > > > support
> > > > > >
> > > > > > On Wed, 2024-02-28 at 19:05 +0800, Yunjian Wang wrote:
> > > > > > > @@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
> > > > > > >         }
> > > > > > >  }
> > > > > > >
> > > > > > > +static void tun_peek_xsk(struct tun_file *tfile) {
> > > > > > > +       struct xsk_buff_pool *pool;
> > > > > > > +       u32 i, batch, budget;
> > > > > > > +       void *frame;
> > > > > > > +
> > > > > > > +       if (!ptr_ring_empty(&tfile->tx_ring))
> > > > > > > +               return;
> > > > > > > +
> > > > > > > +       spin_lock(&tfile->pool_lock);
> > > > > > > +       pool = tfile->xsk_pool;
> > > > > > > +       if (!pool) {
> > > > > > > +               spin_unlock(&tfile->pool_lock);
> > > > > > > +               return;
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       if (tfile->nb_descs) {
> > > > > > > +               xsk_tx_completed(pool, tfile->nb_descs);
> > > > > > > +               if (xsk_uses_need_wakeup(pool))
> > > > > > > +                       xsk_set_tx_need_wakeup(pool);
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       spin_lock(&tfile->tx_ring.producer_lock);
> > > > > > > +       budget = min_t(u32, tfile->tx_ring.size,
> > > > > > > + TUN_XDP_BATCH);
> > > > > > > +
> > > > > > > +       batch = xsk_tx_peek_release_desc_batch(pool, budget);
> > > > > > > +       if (!batch) {
> > > > > >
> > > > > > This branch looks like an unneeded "optimization". The generic
> > > > > > loop below should have the same effect with no measurable perf
> > > > > > delta - and
> > > > smaller code.
> > > > > > Just remove this.
> > > > > >
> > > > > > > +               tfile->nb_descs = 0;
> > > > > > > +               spin_unlock(&tfile->tx_ring.producer_lock);
> > > > > > > +               spin_unlock(&tfile->pool_lock);
> > > > > > > +               return;
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       tfile->nb_descs = batch;
> > > > > > > +       for (i = 0; i < batch; i++) {
> > > > > > > +               /* Encode the XDP DESC flag into lowest bit
> > > > > > > + for consumer to
> > > > differ
> > > > > > > +                * XDP desc from XDP buffer and sk_buff.
> > > > > > > +                */
> > > > > > > +               frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
> > > > > > > +               /* The budget must be less than or equal to
> > tx_ring.size,
> > > > > > > +                * so enqueuing will not fail.
> > > > > > > +                */
> > > > > > > +               __ptr_ring_produce(&tfile->tx_ring, frame);
> > > > > > > +       }
> > > > > > > +       spin_unlock(&tfile->tx_ring.producer_lock);
> > > > > > > +       spin_unlock(&tfile->pool_lock);
> > > > > >
> > > > > > More related to the general design: it looks wrong. What if
> > > > > > get_rx_bufs() will fail (ENOBUF) after successful peeking? With
> > > > > > no more incoming packets, later peek will return 0 and it looks
> > > > > > like that the half-processed packets will stay in the ring forever???
> > > > > >
> > > > > > I think the 'ring produce' part should be moved into tun_do_read().
> > > > >
> > > > > Currently, the vhost-net obtains a batch descriptors/sk_buffs from
> > > > > the ptr_ring and enqueue the batch descriptors/sk_buffs to the
> > > > > virtqueue'queue, and then consumes the descriptors/sk_buffs from
> > > > > the virtqueue'queue in sequence. As a result, TUN does not know
> > > > > whether the batch descriptors have been used up, and thus does not
> > > > > know when to
> > > > return the batch descriptors.
> > > > >
> > > > > So, I think it's reasonable that when vhost-net checks ptr_ring is
> > > > > empty, it calls peek_len to get new xsk's descs and return the descriptors.
> > > > >
> > > > > Thanks
> > > >
> > > > What you need to think about is that if you peek, another call in
> > > > parallel can get the same value at the same time.
> > >
> > > Thank you. I have identified a problem. The tx_descs array was created within
> > xsk's pool.
> > > When xsk is freed, the pool and tx_descs are also freed. However,
> > > some descs may remain in the virtqueue's queue, which could lead to a
> > use-after-free scenario.
> >
> > This can probably be solved by having tun signal vhost_net to drop those
> > descriptors when the xsk pool is disabled.
>
> I think TUN can notify vhost_net to drop these descriptors through netdev events.

Great, actually, the "issue" described above exists in this patch as
well. For example, you did:

                        spin_lock(&tfile->pool_lock);
                        if (tfile->pool) {
                              ret = tun_put_user_desc(tun, tfile,
                                                      &tfile->desc, to);

You do copy_to_user() under a spinlock, which is actually a bug.
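
For illustration, a minimal sketch of keeping the user copy outside the lock,
reusing the names from the patch below (the wrapper name is made up); note
this alone does not solve the lifetime problem, since the pool and its umem
must still outlive the copy:

static ssize_t tun_do_read_xsk_desc(struct tun_struct *tun,
				    struct tun_file *tfile,
				    struct xdp_desc *desc,
				    struct iov_iter *to)
{
	void *data = NULL;
	u32 len = 0;

	/* Resolve the descriptor's data pointer under pool_lock ... */
	spin_lock(&tfile->pool_lock);
	if (tfile->xsk_pool) {
		data = xsk_buff_raw_get_data(tfile->xsk_pool, desc->addr);
		len = desc->len;
	}
	spin_unlock(&tfile->pool_lock);

	/* ... but do the possibly-faulting copy after dropping it. */
	return data ? tun_put_user_xdp(tun, tfile, data, len, to) : 0;
}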

> However, there is a potential concurrency problem: when handling netdev events
> and packets, vhost_net would contend on the 'vq->mutex' lock, leading to unstable performance.

I think we don't need to care about the perf in this case.

And we gain a lot:

1) no trick in peek
2) batching support
...

Thanks

>
> Thanks
> >
> > Thanks
> >
> > > Currently,
> > > I do not have an idea to solve this concurrency problem and believe
> > > this scenario may not be appropriate for reusing the ptr_ring.
> > >
> > > Thanks
> > >
> > > >
> > > >
> > > > > >
> > > > > > Cheers,
> > > > > >
> > > > > > Paolo
> > > > >
> > >
>

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bc80fc1d576e..7f4ff50b532c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -63,6 +63,7 @@ 
 #include <net/rtnetlink.h>
 #include <net/sock.h>
 #include <net/xdp.h>
+#include <net/xdp_sock_drv.h>
 #include <net/ip_tunnels.h>
 #include <linux/seq_file.h>
 #include <linux/uio.h>
@@ -86,6 +87,7 @@  static void tun_default_link_ksettings(struct net_device *dev,
 				       struct ethtool_link_ksettings *cmd);
 
 #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+#define TUN_XDP_BATCH 64
 
 /* TUN device flags */
 
@@ -146,6 +148,9 @@  struct tun_file {
 	struct tun_struct *detached;
 	struct ptr_ring tx_ring;
 	struct xdp_rxq_info xdp_rxq;
+	struct xsk_buff_pool *xsk_pool;
+	spinlock_t pool_lock;	/* Protects xsk pool enable/disable */
+	u32 nb_descs;
 };
 
 struct tun_page {
@@ -614,6 +619,8 @@  void tun_ptr_free(void *ptr)
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
 		xdp_return_frame(xdpf);
+	} else if (tun_is_xdp_desc_frame(ptr)) {
+		return;
 	} else {
 		__skb_array_destroy_skb(ptr);
 	}
@@ -631,6 +638,37 @@  static void tun_queue_purge(struct tun_file *tfile)
 	skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
+static void tun_set_xsk_pool(struct tun_file *tfile, struct xsk_buff_pool *pool)
+{
+	if (!pool)
+		return;
+
+	spin_lock(&tfile->pool_lock);
+	xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
+	tfile->xsk_pool = pool;
+	spin_unlock(&tfile->pool_lock);
+}
+
+static void tun_clean_xsk_pool(struct tun_file *tfile)
+{
+	spin_lock(&tfile->pool_lock);
+	if (tfile->xsk_pool) {
+		void *ptr;
+
+		while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
+			tun_ptr_free(ptr);
+
+		if (tfile->nb_descs) {
+			xsk_tx_completed(tfile->xsk_pool, tfile->nb_descs);
+			if (xsk_uses_need_wakeup(tfile->xsk_pool))
+				xsk_set_tx_need_wakeup(tfile->xsk_pool);
+			tfile->nb_descs = 0;
+		}
+		tfile->xsk_pool = NULL;
+	}
+	spin_unlock(&tfile->pool_lock);
+}
+
 static void __tun_detach(struct tun_file *tfile, bool clean)
 {
 	struct tun_file *ntfile;
@@ -648,6 +686,11 @@  static void __tun_detach(struct tun_file *tfile, bool clean)
 		u16 index = tfile->queue_index;
 		BUG_ON(index >= tun->numqueues);
 
+		ntfile = rtnl_dereference(tun->tfiles[tun->numqueues - 1]);
+		/* Stop xsk zc xmit */
+		tun_clean_xsk_pool(tfile);
+		tun_clean_xsk_pool(ntfile);
+
 		rcu_assign_pointer(tun->tfiles[index],
 				   tun->tfiles[tun->numqueues - 1]);
 		ntfile = rtnl_dereference(tun->tfiles[index]);
@@ -668,6 +711,7 @@  static void __tun_detach(struct tun_file *tfile, bool clean)
 		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
+		tun_set_xsk_pool(ntfile, xsk_get_pool_from_qid(tun->dev, index));
 		tun_set_real_num_queues(tun);
 	} else if (tfile->detached && clean) {
 		tun = tun_enable_queue(tfile);
@@ -801,6 +845,7 @@  static int tun_attach(struct tun_struct *tun, struct file *file,
 
 		if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
 			tfile->xdp_rxq.queue_index = tfile->queue_index;
+		tun_set_xsk_pool(tfile, xsk_get_pool_from_qid(dev, tfile->queue_index));
 	} else {
 		/* Setup XDP RX-queue info, for new tfile getting attached */
 		err = xdp_rxq_info_reg(&tfile->xdp_rxq,
@@ -1221,11 +1266,50 @@  static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	return 0;
 }
 
+static int tun_xsk_pool_enable(struct net_device *netdev,
+			       struct xsk_buff_pool *pool,
+			       u16 qid)
+{
+	struct tun_struct *tun = netdev_priv(netdev);
+	struct tun_file *tfile;
+
+	if (qid >= tun->numqueues)
+		return -EINVAL;
+
+	tfile = rtnl_dereference(tun->tfiles[qid]);
+	tun_set_xsk_pool(tfile, pool);
+
+	return 0;
+}
+
+static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
+{
+	struct tun_struct *tun = netdev_priv(netdev);
+	struct tun_file *tfile;
+
+	if (qid >= MAX_TAP_QUEUES)
+		return -EINVAL;
+
+	tfile = rtnl_dereference(tun->tfiles[qid]);
+	if (tfile)
+		tun_clean_xsk_pool(tfile);
+	return 0;
+}
+
+static int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
+			      u16 qid)
+{
+	return pool ? tun_xsk_pool_enable(dev, pool, qid) :
+		tun_xsk_pool_disable(dev, qid);
+}
+
 static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		return tun_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_SETUP_XSK_POOL:
+		return tun_xsk_pool_setup(dev, xdp->xsk.pool, xdp->xsk.queue_id);
 	default:
 		return -EINVAL;
 	}
@@ -1330,6 +1414,19 @@  static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	return nxmit;
 }
 
+static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
+
+	rcu_read_lock();
+	tfile = rcu_dereference(tun->tfiles[qid]);
+	if (tfile)
+		__tun_xdp_flush_tfile(tfile);
+	rcu_read_unlock();
+	return 0;
+}
+
 static const struct net_device_ops tap_netdev_ops = {
 	.ndo_init		= tun_net_init,
 	.ndo_uninit		= tun_net_uninit,
@@ -1346,6 +1443,7 @@  static const struct net_device_ops tap_netdev_ops = {
 	.ndo_get_stats64	= dev_get_tstats64,
 	.ndo_bpf		= tun_xdp,
 	.ndo_xdp_xmit		= tun_xdp_xmit,
+	.ndo_xsk_wakeup		= tun_xsk_wakeup,
 	.ndo_change_carrier	= tun_net_change_carrier,
 };
 
@@ -1403,7 +1501,8 @@  static void tun_net_initialize(struct net_device *dev)
 		/* Currently tun does not support XDP, only tap does. */
 		dev->xdp_features = NETDEV_XDP_ACT_BASIC |
 				    NETDEV_XDP_ACT_REDIRECT |
-				    NETDEV_XDP_ACT_NDO_XMIT;
+				    NETDEV_XDP_ACT_NDO_XMIT |
+				    NETDEV_XDP_ACT_XSK_ZEROCOPY;
 
 		break;
 	}
@@ -2058,11 +2157,11 @@  static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 				struct tun_file *tfile,
-				struct xdp_frame *xdp_frame,
+				void *addr,
+				size_t size,
 				struct iov_iter *iter)
 {
 	int vnet_hdr_sz = 0;
-	size_t size = xdp_frame->len;
 	size_t ret;
 
 	if (tun->flags & IFF_VNET_HDR) {
@@ -2077,7 +2176,7 @@  static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
 	}
 
-	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
+	ret = copy_to_iter(addr, size, iter) + vnet_hdr_sz;
 
 	preempt_disable();
 	dev_sw_netstats_tx_add(tun->dev, 1, ret);
@@ -2240,8 +2339,20 @@  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	if (tun_is_xdp_frame(ptr)) {
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
+		ret = tun_put_user_xdp(tun, tfile, xdpf->data, xdpf->len, to);
 		xdp_return_frame(xdpf);
+	} else if (tun_is_xdp_desc_frame(ptr)) {
+		struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+		void *data;
+
+		spin_lock(&tfile->pool_lock);
+		if (tfile->xsk_pool) {
+			data = xsk_buff_raw_get_data(tfile->xsk_pool, desc->addr);
+			ret = tun_put_user_xdp(tun, tfile, data, desc->len, to);
+		} else {
+			ret = 0;
+		}
+		spin_unlock(&tfile->pool_lock);
 	} else {
 		struct sk_buff *skb = ptr;
 
@@ -2654,6 +2765,10 @@  static int tun_ptr_peek_len(void *ptr)
 			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
 			return xdpf->len;
+		} else if (tun_is_xdp_desc_frame(ptr)) {
+			struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+
+			return desc->len;
 		}
 		return __skb_array_len_with_tag(ptr);
 	} else {
@@ -2661,6 +2776,54 @@  static int tun_ptr_peek_len(void *ptr)
 	}
 }
 
+static void tun_peek_xsk(struct tun_file *tfile)
+{
+	struct xsk_buff_pool *pool;
+	u32 i, batch, budget;
+	void *frame;
+
+	if (!ptr_ring_empty(&tfile->tx_ring))
+		return;
+
+	spin_lock(&tfile->pool_lock);
+	pool = tfile->xsk_pool;
+	if (!pool) {
+		spin_unlock(&tfile->pool_lock);
+		return;
+	}
+
+	if (tfile->nb_descs) {
+		xsk_tx_completed(pool, tfile->nb_descs);
+		if (xsk_uses_need_wakeup(pool))
+			xsk_set_tx_need_wakeup(pool);
+	}
+
+	spin_lock(&tfile->tx_ring.producer_lock);
+	budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
+
+	batch = xsk_tx_peek_release_desc_batch(pool, budget);
+	if (!batch) {
+		tfile->nb_descs = 0;
+		spin_unlock(&tfile->tx_ring.producer_lock);
+		spin_unlock(&tfile->pool_lock);
+		return;
+	}
+
+	tfile->nb_descs = batch;
+	for (i = 0; i < batch; i++) {
+		/* Encode the XDP DESC flag into the lowest bit for the consumer to
+		 * differentiate an XDP desc from an XDP buffer and an sk_buff.
+		 */
+		frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
+		/* The budget must be less than or equal to tx_ring.size,
+		 * so enqueuing will not fail.
+		 */
+		__ptr_ring_produce(&tfile->tx_ring, frame);
+	}
+	spin_unlock(&tfile->tx_ring.producer_lock);
+	spin_unlock(&tfile->pool_lock);
+}
+
 static int tun_peek_len(struct socket *sock)
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
@@ -2671,6 +2834,9 @@  static int tun_peek_len(struct socket *sock)
 	if (!tun)
 		return 0;
 
+	if (sock_flag(&tfile->sk, SOCK_XDP))
+		tun_peek_xsk(tfile);
+
 	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
 	tun_put(tun);
 
@@ -3473,6 +3639,7 @@  static int tun_chr_open(struct inode *inode, struct file * file)
 	}
 
 	mutex_init(&tfile->napi_mutex);
+	spin_lock_init(&tfile->pool_lock);
 	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 077e74421558..eb83764be26c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -202,6 +202,10 @@  static int vhost_net_buf_peek_len(void *ptr)
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
 		return xdpf->len;
+	} else if (tun_is_xdp_desc_frame(ptr)) {
+		struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+
+		return desc->len;
 	}
 
 	return __skb_array_len_with_tag(ptr);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 043d442994b0..4142453b5e52 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -6,10 +6,12 @@ 
 #ifndef __IF_TUN_H
 #define __IF_TUN_H
 
+#include <uapi/linux/if_xdp.h>
 #include <uapi/linux/if_tun.h>
 #include <uapi/linux/virtio_net.h>
 
 #define TUN_XDP_FLAG 0x1UL
+#define TUN_XDP_DESC_FLAG 0x2UL
 
 #define TUN_MSG_UBUF 1
 #define TUN_MSG_PTR  2
@@ -43,6 +45,21 @@  static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
 	return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
 }
 
+static inline bool tun_is_xdp_desc_frame(void *ptr)
+{
+	return (unsigned long)ptr & TUN_XDP_DESC_FLAG;
+}
+
+static inline void *tun_xdp_desc_to_ptr(struct xdp_desc *desc)
+{
+	return (void *)((unsigned long)desc | TUN_XDP_DESC_FLAG);
+}
+
+static inline struct xdp_desc *tun_ptr_to_xdp_desc(void *ptr)
+{
+	return (void *)((unsigned long)ptr & ~TUN_XDP_DESC_FLAG);
+}
+
 void tun_ptr_free(void *ptr);
 #else
 #include <linux/err.h>
@@ -75,6 +92,21 @@  static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
 	return NULL;
 }
 
+static inline bool tun_is_xdp_desc_frame(void *ptr)
+{
+	return false;
+}
+
+static inline void *tun_xdp_desc_to_ptr(struct xdp_desc *desc)
+{
+	return NULL;
+}
+
+static inline struct xdp_desc *tun_ptr_to_xdp_desc(void *ptr)
+{
+	return NULL;
+}
+
 static inline void tun_ptr_free(void *ptr)
 {
 }