diff mbox series

[net-next,v4,08/10] virtio-net: xsk zero copy xmit setup

Message ID 20210413031523.73507-9-xuanzhuo@linux.alibaba.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series virtio-net support xdp socket zero copy xmit | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 5 maintainers not CCed: yhs@fb.com kpsingh@kernel.org andrii@kernel.org kafai@fb.com songliubraving@fb.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 108 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link

Commit Message

Xuan Zhuo April 13, 2021, 3:15 a.m. UTC
xsk is a high-performance packet receiving and sending technology.

This patch implements the binding and unbinding operations of xsk and
the virtio-net queue for xsk zero copy xmit.

The xsk zero copy xmit depends on tx napi. So if tx napi is not true,
an error will be reported. And the entire operation is under the
protection of rtnl_lock.

If xsk is active, it will prevent ethtool from modifying tx napi.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
---
 drivers/net/virtio_net.c | 78 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

Comments

Jason Wang April 14, 2021, 4:01 a.m. UTC | #1
在 2021/4/13 上午11:15, Xuan Zhuo 写道:
> xsk is a high-performance packet receiving and sending technology.
>
> This patch implements the binding and unbinding operations of xsk and
> the virtio-net queue for xsk zero copy xmit.
>
> The xsk zero copy xmit depends on tx napi.


It's better to describe why zero copy depends on tx napi.


>   So if tx napi is not true,
> an error will be reported. And the entire operation is under the
> protection of rtnl_lock.
>
> If xsk is active, it will prevent ethtool from modifying tx napi.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
>   drivers/net/virtio_net.c | 78 +++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 77 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index f52a25091322..8242a9e9f17d 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -22,6 +22,7 @@
>   #include <net/route.h>
>   #include <net/xdp.h>
>   #include <net/net_failover.h>
> +#include <net/xdp_sock_drv.h>
>   
>   static int napi_weight = NAPI_POLL_WEIGHT;
>   module_param(napi_weight, int, 0444);
> @@ -133,6 +134,11 @@ struct send_queue {
>   	struct virtnet_sq_stats stats;
>   
>   	struct napi_struct napi;
> +
> +	struct {
> +		/* xsk pool */


This comment is unnecessary since the code explains itself.


> +		struct xsk_buff_pool __rcu *pool;
> +	} xsk;
>   };
>   
>   /* Internal representation of a receive virtqueue */
> @@ -2249,8 +2255,19 @@ static int virtnet_set_coalesce(struct net_device *dev,
>   	if (napi_weight ^ vi->sq[0].napi.weight) {
>   		if (dev->flags & IFF_UP)
>   			return -EBUSY;
> -		for (i = 0; i < vi->max_queue_pairs; i++)
> +		for (i = 0; i < vi->max_queue_pairs; i++) {
> +			/* xsk xmit depend on the tx napi. So if xsk is active,
> +			 * prevent modifications to tx napi.
> +			 */
> +			rcu_read_lock();
> +			if (rcu_dereference(vi->sq[i].xsk.pool)) {


Let's use rtnl_derefernece() then the rcu_read_lock()/unlock() is not 
needed.


> +				rcu_read_unlock();
> +				continue;
> +			}
> +			rcu_read_unlock();
> +
>   			vi->sq[i].napi.weight = napi_weight;
> +		}
>   	}
>   
>   	return 0;
> @@ -2518,11 +2535,70 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>   	return err;
>   }
>   
> +static int virtnet_xsk_pool_enable(struct net_device *dev,
> +				   struct xsk_buff_pool *pool,
> +				   u16 qid)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	struct send_queue *sq;
> +
> +	if (qid >= vi->curr_queue_pairs)
> +		return -EINVAL;
> +
> +	sq = &vi->sq[qid];
> +
> +	/* xsk zerocopy depend on the tx napi.
> +	 *
> +	 * xsk zerocopy xmit is driven by the tx interrupt. When the device is
> +	 * not busy, napi will be called continuously to send data. When the
> +	 * device is busy, wait for the notification interrupt after the
> +	 * hardware has finished processing the data, and continue to send data
> +	 * in napi.
> +	 */
> +	if (!sq->napi.weight)
> +		return -EPERM;
> +
> +	rcu_read_lock();
> +	/* Here is already protected by rtnl_lock, so rcu_assign_pointer is
> +	 * safe.
> +	 */
> +	rcu_assign_pointer(sq->xsk.pool, pool);
> +	rcu_read_unlock();


Any reason for the rcu lock here? And don't we need to synchronize rcu here?


> +
> +	return 0;
> +}
> +
> +static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	struct send_queue *sq;
> +
> +	if (qid >= vi->curr_queue_pairs)
> +		return -EINVAL;
> +
> +	sq = &vi->sq[qid];
> +
> +	/* Here is already protected by rtnl_lock, so rcu_assign_pointer is
> +	 * safe.
> +	 */
> +	rcu_assign_pointer(sq->xsk.pool, NULL);
> +
> +	synchronize_net(); /* Sync with the XSK wakeup and with NAPI. */


Let's move the comment above the code.

Thanks


> +
> +	return 0;
> +}
> +
>   static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>   {
>   	switch (xdp->command) {
>   	case XDP_SETUP_PROG:
>   		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
> +	case XDP_SETUP_XSK_POOL:
> +		if (xdp->xsk.pool)
> +			return virtnet_xsk_pool_enable(dev, xdp->xsk.pool,
> +						       xdp->xsk.queue_id);
> +		else
> +			return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id);
>   	default:
>   		return -EINVAL;
>   	}
Magnus Karlsson April 14, 2021, 7:36 a.m. UTC | #2
On Tue, Apr 13, 2021 at 9:58 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> xsk is a high-performance packet receiving and sending technology.
>
> This patch implements the binding and unbinding operations of xsk and
> the virtio-net queue for xsk zero copy xmit.
>
> The xsk zero copy xmit depends on tx napi. So if tx napi is not true,
> an error will be reported. And the entire operation is under the
> protection of rtnl_lock.
>
> If xsk is active, it will prevent ethtool from modifying tx napi.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
>  drivers/net/virtio_net.c | 78 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 77 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index f52a25091322..8242a9e9f17d 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -22,6 +22,7 @@
>  #include <net/route.h>
>  #include <net/xdp.h>
>  #include <net/net_failover.h>
> +#include <net/xdp_sock_drv.h>
>
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
> @@ -133,6 +134,11 @@ struct send_queue {
>         struct virtnet_sq_stats stats;
>
>         struct napi_struct napi;
> +
> +       struct {
> +               /* xsk pool */
> +               struct xsk_buff_pool __rcu *pool;
> +       } xsk;
>  };
>
>  /* Internal representation of a receive virtqueue */
> @@ -2249,8 +2255,19 @@ static int virtnet_set_coalesce(struct net_device *dev,
>         if (napi_weight ^ vi->sq[0].napi.weight) {
>                 if (dev->flags & IFF_UP)
>                         return -EBUSY;
> -               for (i = 0; i < vi->max_queue_pairs; i++)
> +               for (i = 0; i < vi->max_queue_pairs; i++) {
> +                       /* xsk xmit depend on the tx napi. So if xsk is active,
> +                        * prevent modifications to tx napi.
> +                        */
> +                       rcu_read_lock();
> +                       if (rcu_dereference(vi->sq[i].xsk.pool)) {
> +                               rcu_read_unlock();
> +                               continue;
> +                       }
> +                       rcu_read_unlock();
> +
>                         vi->sq[i].napi.weight = napi_weight;
> +               }
>         }
>
>         return 0;
> @@ -2518,11 +2535,70 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>         return err;
>  }
>
> +static int virtnet_xsk_pool_enable(struct net_device *dev,
> +                                  struct xsk_buff_pool *pool,
> +                                  u16 qid)
> +{
> +       struct virtnet_info *vi = netdev_priv(dev);
> +       struct send_queue *sq;
> +
> +       if (qid >= vi->curr_queue_pairs)
> +               return -EINVAL;

Your implementation is the first implementation that only supports
zerocopy for one out of Rx and Tx, and this will currently confuse the
control plane in some situations since it assumes that both Rx and Tx
are enabled by a call to this NDO. For example: user space creates an
xsk socket with both an Rx and a Tx ring, then calls bind with the
XDP_ZEROCOPY flag. In this case, the call should fail if the device is
virtio-net since it only supports zerocopy for Tx. But it should
succeed if the user only created a Tx ring since that makes it a
Tx-only socket which can be supported.

So you need to introduce a new interface in xdp_sock_drv.h that can be
used to ask if this socket has Rx enabled and if so fail the call (at
least one of them has to be enabled, otherwise the bind call would
fail before this ndo is called). Then the logic above will act on that
and try to fall back to copy mode (if allowed). Such an interface
(with an added "is_tx_enabled") might in the future be useful for
physical NIC drivers too if they would like to save on resources for
Tx-only and Rx-only sockets. Currently, they all just assume every
socket is Rx and Tx.

Thanks: Magnus

> +
> +       sq = &vi->sq[qid];
> +
> +       /* xsk zerocopy depend on the tx napi.
> +        *
> +        * xsk zerocopy xmit is driven by the tx interrupt. When the device is
> +        * not busy, napi will be called continuously to send data. When the
> +        * device is busy, wait for the notification interrupt after the
> +        * hardware has finished processing the data, and continue to send data
> +        * in napi.
> +        */
> +       if (!sq->napi.weight)
> +               return -EPERM;
> +
> +       rcu_read_lock();
> +       /* Here is already protected by rtnl_lock, so rcu_assign_pointer is
> +        * safe.
> +        */
> +       rcu_assign_pointer(sq->xsk.pool, pool);
> +       rcu_read_unlock();
> +
> +       return 0;
> +}
> +
> +static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
> +{
> +       struct virtnet_info *vi = netdev_priv(dev);
> +       struct send_queue *sq;
> +
> +       if (qid >= vi->curr_queue_pairs)
> +               return -EINVAL;
> +
> +       sq = &vi->sq[qid];
> +
> +       /* Here is already protected by rtnl_lock, so rcu_assign_pointer is
> +        * safe.
> +        */
> +       rcu_assign_pointer(sq->xsk.pool, NULL);
> +
> +       synchronize_net(); /* Sync with the XSK wakeup and with NAPI. */
> +
> +       return 0;
> +}
> +
>  static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  {
>         switch (xdp->command) {
>         case XDP_SETUP_PROG:
>                 return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
> +       case XDP_SETUP_XSK_POOL:
> +               if (xdp->xsk.pool)
> +                       return virtnet_xsk_pool_enable(dev, xdp->xsk.pool,
> +                                                      xdp->xsk.queue_id);
> +               else
> +                       return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id);
>         default:
>                 return -EINVAL;
>         }
> --
> 2.31.0
>
Jason Wang April 14, 2021, 7:51 a.m. UTC | #3
在 2021/4/14 下午3:36, Magnus Karlsson 写道:
> On Tue, Apr 13, 2021 at 9:58 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>> xsk is a high-performance packet receiving and sending technology.
>>
>> This patch implements the binding and unbinding operations of xsk and
>> the virtio-net queue for xsk zero copy xmit.
>>
>> The xsk zero copy xmit depends on tx napi. So if tx napi is not true,
>> an error will be reported. And the entire operation is under the
>> protection of rtnl_lock.
>>
>> If xsk is active, it will prevent ethtool from modifying tx napi.
>>
>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
>> ---
>>   drivers/net/virtio_net.c | 78 +++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 77 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index f52a25091322..8242a9e9f17d 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -22,6 +22,7 @@
>>   #include <net/route.h>
>>   #include <net/xdp.h>
>>   #include <net/net_failover.h>
>> +#include <net/xdp_sock_drv.h>
>>
>>   static int napi_weight = NAPI_POLL_WEIGHT;
>>   module_param(napi_weight, int, 0444);
>> @@ -133,6 +134,11 @@ struct send_queue {
>>          struct virtnet_sq_stats stats;
>>
>>          struct napi_struct napi;
>> +
>> +       struct {
>> +               /* xsk pool */
>> +               struct xsk_buff_pool __rcu *pool;
>> +       } xsk;
>>   };
>>
>>   /* Internal representation of a receive virtqueue */
>> @@ -2249,8 +2255,19 @@ static int virtnet_set_coalesce(struct net_device *dev,
>>          if (napi_weight ^ vi->sq[0].napi.weight) {
>>                  if (dev->flags & IFF_UP)
>>                          return -EBUSY;
>> -               for (i = 0; i < vi->max_queue_pairs; i++)
>> +               for (i = 0; i < vi->max_queue_pairs; i++) {
>> +                       /* xsk xmit depend on the tx napi. So if xsk is active,
>> +                        * prevent modifications to tx napi.
>> +                        */
>> +                       rcu_read_lock();
>> +                       if (rcu_dereference(vi->sq[i].xsk.pool)) {
>> +                               rcu_read_unlock();
>> +                               continue;
>> +                       }
>> +                       rcu_read_unlock();
>> +
>>                          vi->sq[i].napi.weight = napi_weight;
>> +               }
>>          }
>>
>>          return 0;
>> @@ -2518,11 +2535,70 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>>          return err;
>>   }
>>
>> +static int virtnet_xsk_pool_enable(struct net_device *dev,
>> +                                  struct xsk_buff_pool *pool,
>> +                                  u16 qid)
>> +{
>> +       struct virtnet_info *vi = netdev_priv(dev);
>> +       struct send_queue *sq;
>> +
>> +       if (qid >= vi->curr_queue_pairs)
>> +               return -EINVAL;
> Your implementation is the first implementation that only supports
> zerocopy for one out of Rx and Tx, and this will currently confuse the
> control plane in some situations since it assumes that both Rx and Tx
> are enabled by a call to this NDO. For example: user space creates an
> xsk socket with both an Rx and a Tx ring, then calls bind with the
> XDP_ZEROCOPY flag. In this case, the call should fail if the device is
> virtio-net since it only supports zerocopy for Tx. But it should
> succeed if the user only created a Tx ring since that makes it a
> Tx-only socket which can be supported.
>
> So you need to introduce a new interface in xdp_sock_drv.h that can be
> used to ask if this socket has Rx enabled and if so fail the call (at
> least one of them has to be enabled, otherwise the bind call would
> fail before this ndo is called). Then the logic above will act on that
> and try to fall back to copy mode (if allowed). Such an interface
> (with an added "is_tx_enabled") might in the future be useful for
> physical NIC drivers too if they would like to save on resources for
> Tx-only and Rx-only sockets. Currently, they all just assume every
> socket is Rx and Tx.


So if there's no blocker for implementing the zerocopy RX, I think we'd 
better to implement it in this series without introducing new APIs for 
the upper layer.

Thanks


>
> Thanks: Magnus
>
>> +
>> +       sq = &vi->sq[qid];
>> +
>> +       /* xsk zerocopy depend on the tx napi.
>> +        *
>> +        * xsk zerocopy xmit is driven by the tx interrupt. When the device is
>> +        * not busy, napi will be called continuously to send data. When the
>> +        * device is busy, wait for the notification interrupt after the
>> +        * hardware has finished processing the data, and continue to send data
>> +        * in napi.
>> +        */
>> +       if (!sq->napi.weight)
>> +               return -EPERM;
>> +
>> +       rcu_read_lock();
>> +       /* Here is already protected by rtnl_lock, so rcu_assign_pointer is
>> +        * safe.
>> +        */
>> +       rcu_assign_pointer(sq->xsk.pool, pool);
>> +       rcu_read_unlock();
>> +
>> +       return 0;
>> +}
>> +
>> +static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
>> +{
>> +       struct virtnet_info *vi = netdev_priv(dev);
>> +       struct send_queue *sq;
>> +
>> +       if (qid >= vi->curr_queue_pairs)
>> +               return -EINVAL;
>> +
>> +       sq = &vi->sq[qid];
>> +
>> +       /* Here is already protected by rtnl_lock, so rcu_assign_pointer is
>> +        * safe.
>> +        */
>> +       rcu_assign_pointer(sq->xsk.pool, NULL);
>> +
>> +       synchronize_net(); /* Sync with the XSK wakeup and with NAPI. */
>> +
>> +       return 0;
>> +}
>> +
>>   static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>>   {
>>          switch (xdp->command) {
>>          case XDP_SETUP_PROG:
>>                  return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
>> +       case XDP_SETUP_XSK_POOL:
>> +               if (xdp->xsk.pool)
>> +                       return virtnet_xsk_pool_enable(dev, xdp->xsk.pool,
>> +                                                      xdp->xsk.queue_id);
>> +               else
>> +                       return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id);
>>          default:
>>                  return -EINVAL;
>>          }
>> --
>> 2.31.0
>>
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f52a25091322..8242a9e9f17d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -22,6 +22,7 @@ 
 #include <net/route.h>
 #include <net/xdp.h>
 #include <net/net_failover.h>
+#include <net/xdp_sock_drv.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -133,6 +134,11 @@  struct send_queue {
 	struct virtnet_sq_stats stats;
 
 	struct napi_struct napi;
+
+	struct {
+		/* xsk pool */
+		struct xsk_buff_pool __rcu *pool;
+	} xsk;
 };
 
 /* Internal representation of a receive virtqueue */
@@ -2249,8 +2255,19 @@  static int virtnet_set_coalesce(struct net_device *dev,
 	if (napi_weight ^ vi->sq[0].napi.weight) {
 		if (dev->flags & IFF_UP)
 			return -EBUSY;
-		for (i = 0; i < vi->max_queue_pairs; i++)
+		for (i = 0; i < vi->max_queue_pairs; i++) {
+			/* xsk xmit depend on the tx napi. So if xsk is active,
+			 * prevent modifications to tx napi.
+			 */
+			rcu_read_lock();
+			if (rcu_dereference(vi->sq[i].xsk.pool)) {
+				rcu_read_unlock();
+				continue;
+			}
+			rcu_read_unlock();
+
 			vi->sq[i].napi.weight = napi_weight;
+		}
 	}
 
 	return 0;
@@ -2518,11 +2535,70 @@  static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	return err;
 }
 
+static int virtnet_xsk_pool_enable(struct net_device *dev,
+				   struct xsk_buff_pool *pool,
+				   u16 qid)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct send_queue *sq;
+
+	if (qid >= vi->curr_queue_pairs)
+		return -EINVAL;
+
+	sq = &vi->sq[qid];
+
+	/* xsk zerocopy depend on the tx napi.
+	 *
+	 * xsk zerocopy xmit is driven by the tx interrupt. When the device is
+	 * not busy, napi will be called continuously to send data. When the
+	 * device is busy, wait for the notification interrupt after the
+	 * hardware has finished processing the data, and continue to send data
+	 * in napi.
+	 */
+	if (!sq->napi.weight)
+		return -EPERM;
+
+	rcu_read_lock();
+	/* Here is already protected by rtnl_lock, so rcu_assign_pointer is
+	 * safe.
+	 */
+	rcu_assign_pointer(sq->xsk.pool, pool);
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct send_queue *sq;
+
+	if (qid >= vi->curr_queue_pairs)
+		return -EINVAL;
+
+	sq = &vi->sq[qid];
+
+	/* Here is already protected by rtnl_lock, so rcu_assign_pointer is
+	 * safe.
+	 */
+	rcu_assign_pointer(sq->xsk.pool, NULL);
+
+	synchronize_net(); /* Sync with the XSK wakeup and with NAPI. */
+
+	return 0;
+}
+
 static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_SETUP_XSK_POOL:
+		if (xdp->xsk.pool)
+			return virtnet_xsk_pool_enable(dev, xdp->xsk.pool,
+						       xdp->xsk.queue_id);
+		else
+			return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id);
 	default:
 		return -EINVAL;
 	}