diff mbox series

[3/6] vsock: add netdev to vhost/virtio vsock

Message ID 5a93c5aad99d79f028d349cb7e3c128c65d5d7e2.1660362668.git.bobby.eshleman@bytedance.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series virtio/vsock: introduce dgrams, sk_buff, and qdisc | expand

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 433 this patch: 433
netdev/cc_maintainers success CCed 11 of 11 maintainers
netdev/build_clang fail Errors and warnings before: 7 this patch: 7
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 433 this patch: 433
netdev/checkpatch warning WARNING: From:/Signed-off-by: email address mismatch: 'From: Bobby Eshleman <bobby.eshleman@gmail.com>' != 'Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>' WARNING: line length of 91 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 2 this patch: 2
netdev/source_inline success Was 0 now: 0

Commit Message

Bobby Eshleman Aug. 15, 2022, 5:56 p.m. UTC
In order to support usage of qdisc on vsock traffic, this commit
introduces a struct net_device to vhost and virtio vsock.

Two new devices are created, vhost-vsock for vhost and virtio-vsock
for virtio. The devices are attached to the respective transports.

To bypass the usage of the device, the user may "down" the associated
network interface using common tools. For example, "ip link set dev
virtio-vsock down" lets vsock bypass the net_device and qdisc entirely,
simply using the FIFO logic of the prior implementation.

For both hosts and guests, there is one device for all G2H vsock sockets
and one device for all H2G vsock sockets. This makes sense for guests
because the driver only supports a single vsock channel (one pair of
TX/RX virtqueues), so one device and qdisc fits. For hosts, this may not
seem ideal for some workloads. However, it is possible to use a
multi-queue qdisc, where a given queue is responsible for a range of
sockets. This seems to be a better solution than having one device per
socket, which may yield a very large number of devices and qdiscs, all
of which are dynamically being created and destroyed. Because of this
dynamism, it would also require a complex policy management daemon, as
devices would constantly be spun up and down as sockets were created and
destroyed. To avoid this, one device and qdisc also applies to all H2G
sockets.

Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
---
 drivers/vhost/vsock.c                   |  19 +++-
 include/linux/virtio_vsock.h            |  10 +++
 net/vmw_vsock/virtio_transport.c        |  19 +++-
 net/vmw_vsock/virtio_transport_common.c | 112 +++++++++++++++++++++++-
 4 files changed, 152 insertions(+), 8 deletions(-)

Comments

Bobby Eshleman Aug. 16, 2022, 2:31 a.m. UTC | #1
CC'ing virtio-dev@lists.oasis-open.org

On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote:
> [... full patch quoted; snipped here, see the patch at the bottom of this page ...]
Bobby Eshleman Aug. 16, 2022, 6:18 a.m. UTC | #2
On Tue, Aug 16, 2022 at 12:38:52PM -0400, Michael S. Tsirkin wrote:
> On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote:
> > In order to support usage of qdisc on vsock traffic, this commit
> > introduces a struct net_device to vhost and virtio vsock.
> > 
> > Two new devices are created, vhost-vsock for vhost and virtio-vsock
> > for virtio. The devices are attached to the respective transports.
> > 
> > To bypass the usage of the device, the user may "down" the associated
> > network interface using common tools. For example, "ip link set dev
> > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely,
> > simply using the FIFO logic of the prior implementation.
> 
> Ugh. That's quite a hack. Mark my words, at some point we will decide to
> have down mean "down".  Besides, multiple net devices with link up tend
> to confuse userspace. So we might want to keep it down at all times,
> even short term.
> 

I have to admit, this choice was born more of perceived necessity than
of a love for the design... but I can explain the pain points that led
to the current state, which I hope sparks some discussion.

When the state is down, dev_queue_xmit() will fail. To avoid this and
preserve the "zero-configuration" guarantee of vsock, I chose to make
transmission work regardless of device state by implementing this
"ignore up/down state" hack.

This is unfortunate because what we are really after here is just packet
scheduling, i.e., qdisc. We don't really need the rest of the
net_device, and I don't think up/down buys us anything of value. The
relationship between qdisc and net_device is so tightly knit together
though, that using qdisc without a net_device doesn't look very
practical (and maybe impossible).

Some alternative routes might be:

1) Default the state to up, and let users disable vsock by downing the
   device if they'd like. It still works out-of-the-box, but if users
   really want to disable vsock they may.

2) vsock may simply bring the device up when a socket is first
   used. For instance, the HCI device in net/bluetooth/hci_* uses a
   technique where the net_device is brought up when bind() is called on
   any HCI socket (BTPROTO_HCI). It can also be brought up/down via
   ioctl(). (A rough sketch of this follows the list below.)

3) Modify net_device registration to allow us to have an invisible
   device that is only known by the kernel. It may default to up and remain
   unchanged. The qdisc controls alone may be exposed to userspace,
   hopefully via netlink to still work with tc. This is not
   currently supported by register_netdevice(), but a series from 2007 was
   sent to the ML, tentatively approved in concept, but was never merged[1].

4) Currently NETDEV_UP/NETDEV_DOWN commands can't be vetoed.
   NETDEV_PRE_UP, however, is used to effectively veto NETDEV_UP
   commands[2]. We could introduce NETDEV_PRE_DOWN to support vetoing of
   NETDEV_DOWN. This would allow us to install a hook to determine if
   we actually want to allow the device to be downed.
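
For (2), a very rough sketch of the lazy bring-up (vsock_ensure_dev_up()
is a hypothetical helper, not part of this series):

/* Hypothetical: bring the transport's net_device up the first time a
 * vsock socket is used, in the spirit of the BTPROTO_HCI approach.
 */
static int vsock_ensure_dev_up(struct virtio_transport *t)
{
	int ret = 0;

	rtnl_lock();
	if (!(t->dev->flags & IFF_UP))
		ret = dev_open(t->dev, NULL);
	rtnl_unlock();

	return ret;
}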

In an ideal world, we could simply pass a set of netdev queues, a
packet, and maybe a blob of state to qdisc and let it work its
scheduling magic...

Any thoughts?

[1]: https://lore.kernel.org/netdev/20070129140958.0cf6880f@freekitty/
[2]: https://lore.kernel.org/all/20090529.220906.243061042.davem@davemloft.net/

Thanks,
Bobby
Bobby Eshleman Aug. 16, 2022, 7:02 a.m. UTC | #3
On Tue, Aug 16, 2022 at 11:07:17AM -0700, Jakub Kicinski wrote:
> On Tue, 16 Aug 2022 12:38:52 -0400 Michael S. Tsirkin wrote:
> > On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote:
> > > In order to support usage of qdisc on vsock traffic, this commit
> > > introduces a struct net_device to vhost and virtio vsock.
> > > 
> > > Two new devices are created, vhost-vsock for vhost and virtio-vsock
> > > for virtio. The devices are attached to the respective transports.
> > > 
> > > To bypass the usage of the device, the user may "down" the associated
> > > network interface using common tools. For example, "ip link set dev
> > > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely,
> > > simply using the FIFO logic of the prior implementation.  
> > 
> > Ugh. That's quite a hack. Mark my words, at some point we will decide to
> > have down mean "down".  Besides, multiple net devices with link up tend
> > to confuse userspace. So we might want to keep it down at all times,
> > even short term.
> 
> Agreed!
> 
> From a cursory look (and Documentation/ would be nice..) it feels
> very wrong to me. Do you know of any uses of a netdev which would 
> be semantically similar to what you're doing? Treating netdevs as
> building blocks for arbitrary message passing solutions is something 
> I dislike quite strongly.

The big difference between vsock and "arbitrary message passing" is that
vsock is actually constrained by the virtio device that backs it (made
up of virtqueues and the underlying protocol). That virtqueue pair is
acting like the queues on a physical NIC, so it actually makes sense to
manage the queuing of vsock's device like we would manage the queueing
of a real device.

Still, I concede that ignoring the netdev state is probably a bad idea.

That said, I also think that using packet scheduling in vsock is a good
idea, and that ideally we can reuse Linux's already robust library of
packet scheduling algorithms by introducing qdisc somehow.

> 
> Could you recommend where I can learn more about vsocks?

I think the spec is probably the best place to start[1].

[1]: https://docs.oasis-open.org/virtio/virtio/v1.2/virtio-v1.2.html

Best,
Bobby
Bobby Eshleman Aug. 16, 2022, 8:29 a.m. UTC | #4
On Tue, Aug 16, 2022 at 04:07:55PM -0700, Jakub Kicinski wrote:
> On Tue, 16 Aug 2022 07:02:33 +0000 Bobby Eshleman wrote:
> > > From a cursory look (and Documentation/ would be nice..) it feels
> > > very wrong to me. Do you know of any uses of a netdev which would 
> > > be semantically similar to what you're doing? Treating netdevs as
> > > building blocks for arbitrary message passing solutions is something 
> > > I dislike quite strongly.  
> > 
> > The big difference between vsock and "arbitrary message passing" is that
> > vsock is actually constrained by the virtio device that backs it (made
> > up of virtqueues and the underlying protocol). That virtqueue pair is
> > acting like the queues on a physical NIC, so it actually makes sense to
> > manage the queuing of vsock's device like we would manage the queueing
> > of a real device.
> > 
> > Still, I concede that ignoring the netdev state is probably a bad idea.
> > 
> > That said, I also think that using packet scheduling in vsock is a good
> > idea, and that ideally we can reuse Linux's already robust library of
> > packet scheduling algorithms by introducing qdisc somehow.
> 
> We've been burnt in the past by people doing the "let me just pick
> these useful pieces out of netdev" thing. Makes life hard both for
> maintainers and users trying to make sense of the interfaces.
> 
> What comes to mind if you're just after queuing is that we already
> bastardized the CoDel implementation (include/net/codel_impl.h).
> If CoDel is good enough for you maybe that's the easiest way?
> Although I suspect that you're after fairness not early drops.
> Wireless folks use CoDel as a second layer queuing. (CC: Toke)
> 

That is certainly interesting to me. Sitting next to "codel_impl.h" is
"include/net/fq_impl.h", and it looks like it may solve the datagram
flooding issue. The downside to this approach is the baking of a
specific policy into vsock... which I don't exactly love either.
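
Concretely, baking fq into the vsock TX path might look very roughly
like this. The vsock_fq_* names are made up, and I'm assuming the
include/net/fq_impl.h signatures as mac80211 consumes them, so treat
this as a sketch under those assumptions:

#include <net/fq.h>
#include <net/fq_impl.h>

static struct fq vsock_fq;
static struct fq_tin vsock_tin;

static void vsock_fq_skb_free(struct fq *fq, struct fq_tin *tin,
			      struct fq_flow *flow, struct sk_buff *skb)
{
	kfree_skb(skb);
}

static int vsock_fq_setup(void)
{
	int ret;

	ret = fq_init(&vsock_fq, 1024);	/* flow count picked arbitrarily */
	if (ret)
		return ret;

	fq_tin_init(&vsock_tin);
	return 0;
}

static void vsock_fq_enqueue(struct sk_buff *skb)
{
	u32 idx;

	spin_lock_bh(&vsock_fq.lock);
	idx = fq_flow_idx(&vsock_fq, skb);
	fq_tin_enqueue(&vsock_fq, &vsock_tin, idx, skb, vsock_fq_skb_free);
	spin_unlock_bh(&vsock_fq.lock);
}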

I'm not seeing too many other of these qdisc bastardizations in
include/net, are there any others that you are aware of?

> > > Could you recommend where I can learn more about vsocks?  
> > 
> > I think the spec is probably the best place to start[1].
> > 
> > [1]: https://docs.oasis-open.org/virtio/virtio/v1.2/virtio-v1.2.html
> 
> Eh, I was hoping it was a side channel of an existing virtio_net 
> which is not the case. Given the zero-config requirement IDK if 
> we'll be able to fit this into netdev semantics :(

It's certainly possible that it may not fit :/ I feel that it partially
depends on what we mean by zero-config. Is it "no config required to
have a working socket" or is it "no config required, but also no
tuning/policy/etc... supported"?

Best,
Bobby
Bobby Eshleman Aug. 16, 2022, 10:50 a.m. UTC | #5
On Tue, Aug 16, 2022 at 06:15:28PM -0700, Jakub Kicinski wrote:
> On Tue, 16 Aug 2022 08:29:04 +0000 Bobby Eshleman wrote:
> > > We've been burnt in the past by people doing the "let me just pick
> > > these useful pieces out of netdev" thing. Makes life hard both for
> > > maintainers and users trying to make sense of the interfaces.
> > > 
> > > What comes to mind if you're just after queuing is that we already
> > > bastardized the CoDel implementation (include/net/codel_impl.h).
> > > If CoDel is good enough for you maybe that's the easiest way?
> > > Although I suspect that you're after fairness not early drops.
> > > Wireless folks use CoDel as a second layer queuing. (CC: Toke)
> > 
> > That is certainly interesting to me. Sitting next to "codel_impl.h" is
> > "include/net/fq_impl.h", and it looks like it may solve the datagram
> > flooding issue. The downside to this approach is the baking of a
> > specific policy into vsock... which I don't exactly love either.
> > 
> > I'm not seeing too many other of these qdisc bastardizations in
> > include/net, are there any others that you are aware of?
> 
> Just what wireless uses (so codel and fq as you found out), nothing
> else comes to mind.
> 
> > > Eh, I was hoping it was a side channel of an existing virtio_net 
> > > which is not the case. Given the zero-config requirement IDK if 
> > > we'll be able to fit this into netdev semantics :(  
> > 
> > It's certainly possible that it may not fit :/ I feel that it partially
> > depends on what we mean by zero-config. Is it "no config required to
> > have a working socket" or is it "no config required, but also no
> > tuning/policy/etc... supported"?
> 
> The value of tuning vs confusion of a strange netdev floating around
> in the system is hard to estimate upfront. 

I think "a strange netdev floating around" is a total
mischaracterization... vsock is a networking device and it supports
vsock networks. Sure, it is a virtual device and the routing is done in
host software, but the same is true for virtio-net and VM-to-VM vlan.

This patch actually uses netdev for its intended purpose: to support and
manage the transmission of packets via a network device to a network.

Furthermore, it actually prepares vsock to eliminate a "strange" use of
a netdev. The netdev in vsockmon isn't even used to transmit
packets, it's "floating around" for no other reason than it is needed to
support packet capture, which vsock couldn't support because it didn't
have a netdev.

Something smells when we are required to build workaround kernel modules
that use netdev for siphoning packets off to userspace, when we could
instead be using netdev for its intended purpose and get the same
benefit and more.

> 
> The nice thing about using a built-in fq with no user visible knobs is
> that there's no extra uAPI. We can always rip it out and replace later.
> And it shouldn't be controversial, making the path to upstream smoother.

The issue is that after pulling in fq for one kind of flow management,
as users observe other flow issues we will need to re-implement pfifo,
and then TBF, and then build an interface to let users select one and
choose queue sizes... and after a while we've needlessly re-implemented
huge chunks of the tc system.

I don't see any good reason to restrict vsock users to using suboptimal
and rigid queuing.

Thanks.
Michael S. Tsirkin Aug. 16, 2022, 4:38 p.m. UTC | #6
On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote:
> In order to support usage of qdisc on vsock traffic, this commit
> introduces a struct net_device to vhost and virtio vsock.
> 
> Two new devices are created, vhost-vsock for vhost and virtio-vsock
> for virtio. The devices are attached to the respective transports.
> 
> To bypass the usage of the device, the user may "down" the associated
> network interface using common tools. For example, "ip link set dev
> virtio-vsock down" lets vsock bypass the net_device and qdisc entirely,
> simply using the FIFO logic of the prior implementation.

Ugh. That's quite a hack. Mark my words, at some point we will decide to
have down mean "down".  Besides, multiple net devices with link up tend
to confuse userspace. So we might want to keep it down at all times,
even short term.

> For both hosts and guests, there is one device for all G2H vsock sockets
> and one device for all H2G vsock sockets. This makes sense for guests
> because the driver only supports a single vsock channel (one pair of
> TX/RX virtqueues), so one device and qdisc fits. For hosts, this may not
> seem ideal for some workloads. However, it is possible to use a
> multi-queue qdisc, where a given queue is responsible for a range of
> sockets. This seems to be a better solution than having one device per
> socket, which may yield a very large number of devices and qdiscs, all
> of which are dynamically being created and destroyed. Because of this
> dynamism, it would also require a complex policy management daemon, as
> devices would constantly be spun up and down as sockets were created and
> destroyed. To avoid this, one device and qdisc also applies to all H2G
> sockets.
> 
> Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
> [... diffstat and patch body snipped; see the patch at the bottom of this page ...]
Jakub Kicinski Aug. 16, 2022, 6:07 p.m. UTC | #7
On Tue, 16 Aug 2022 12:38:52 -0400 Michael S. Tsirkin wrote:
> On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote:
> > In order to support usage of qdisc on vsock traffic, this commit
> > introduces a struct net_device to vhost and virtio vsock.
> > 
> > Two new devices are created, vhost-vsock for vhost and virtio-vsock
> > for virtio. The devices are attached to the respective transports.
> > 
> > To bypass the usage of the device, the user may "down" the associated
> > network interface using common tools. For example, "ip link set dev
> > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely,
> > simply using the FIFO logic of the prior implementation.  
> 
> Ugh. That's quite a hack. Mark my words, at some point we will decide to
> have down mean "down".  Besides, multiple net devices with link up tend
> to confuse userspace. So we might want to keep it down at all times,
> even short term.

Agreed!

From a cursory look (and Documentation/ would be nice..) it feels
very wrong to me. Do you know of any uses of a netdev which would 
be semantically similar to what you're doing? Treating netdevs as
building blocks for arbitrary message passing solutions is something 
I dislike quite strongly.

Could you recommend where I can learn more about vsocks?
Jakub Kicinski Aug. 16, 2022, 11:07 p.m. UTC | #8
On Tue, 16 Aug 2022 07:02:33 +0000 Bobby Eshleman wrote:
> > From a cursory look (and Documentation/ would be nice..) it feels
> > very wrong to me. Do you know of any uses of a netdev which would 
> > be semantically similar to what you're doing? Treating netdevs as
> > building blocks for arbitrary message passing solutions is something 
> > I dislike quite strongly.  
> 
> The big difference between vsock and "arbitrary message passing" is that
> vsock is actually constrained by the virtio device that backs it (made
> up of virtqueues and the underlying protocol). That virtqueue pair is
> acting like the queues on a physical NIC, so it actually makes sense to
> manage the queuing of vsock's device like we would manage the queueing
> of a real device.
> 
> Still, I concede that ignoring the netdev state is probably a bad idea.
> 
> That said, I also think that using packet scheduling in vsock is a good
> idea, and that ideally we can reuse Linux's already robust library of
> packet scheduling algorithms by introducing qdisc somehow.

We've been burnt in the past by people doing the "let me just pick
these useful pieces out of netdev" thing. Makes life hard both for
maintainers and users trying to make sense of the interfaces.

What comes to mind if you're just after queuing is that we already
bastardized the CoDel implementation (include/net/codel_impl.h).
If CoDel is good enough for you maybe that's the easiest way?
Although I suspect that you're after fairness not early drops.
Wireless folks use CoDel as a second layer queuing. (CC: Toke)

> > Could you recommend where I can learn more about vsocks?  
> 
> I think the spec is probably the best place to start[1].
> 
> [1]: https://docs.oasis-open.org/virtio/virtio/v1.2/virtio-v1.2.html

Eh, I was hoping it was a side channel of an existing virtio_net 
which is not the case. Given the zero-config requirement IDK if 
we'll be able to fit this into netdev semantics :(
Jakub Kicinski Aug. 17, 2022, 1:15 a.m. UTC | #9
On Tue, 16 Aug 2022 08:29:04 +0000 Bobby Eshleman wrote:
> > We've been burnt in the past by people doing the "let me just pick
> > these useful pieces out of netdev" thing. Makes life hard both for
> > maintainers and users trying to make sense of the interfaces.
> > 
> > What comes to mind if you're just after queuing is that we already
> > bastardized the CoDel implementation (include/net/codel_impl.h).
> > If CoDel is good enough for you maybe that's the easiest way?
> > Although I suspect that you're after fairness not early drops.
> > Wireless folks use CoDel as a second layer queuing. (CC: Toke)
> 
> That is certainly interesting to me. Sitting next to "codel_impl.h" is
> "include/net/fq_impl.h", and it looks like it may solve the datagram
> flooding issue. The downside to this approach is the baking of a
> specific policy into vsock... which I don't exactly love either.
> 
> I'm not seeing too many other of these qdisc bastardizations in
> include/net, are there any others that you are aware of?

Just what wireless uses (so codel and fq as you found out), nothing
else comes to mind.

> > Eh, I was hoping it was a side channel of an existing virtio_net 
> > which is not the case. Given the zero-config requirement IDK if 
> > we'll be able to fit this into netdev semantics :(  
> 
> It's certainly possible that it may not fit :/ I feel that it partially
> depends on what we mean by zero-config. Is it "no config required to
> have a working socket" or is it "no config required, but also no
> tuning/policy/etc... supported"?

The value of tuning vs confusion of a strange netdev floating around
in the system is hard to estimate upfront. 

The nice thing about using a built-in fq with no user visible knobs is
that there's no extra uAPI. We can always rip it out and replace later.
And it shouldn't be controversial, making the path to upstream smoother.
Cong Wang . Aug. 17, 2022, 1:23 a.m. UTC | #10
On Tue, Aug 16, 2022 at 4:07 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Tue, 16 Aug 2022 07:02:33 +0000 Bobby Eshleman wrote:
> > > From a cursory look (and Documentation/ would be nice..) it feels
> > > very wrong to me. Do you know of any uses of a netdev which would
> > > be semantically similar to what you're doing? Treating netdevs as
> > > building blocks for arbitrary message passing solutions is something
> > > I dislike quite strongly.
> >
> > The big difference between vsock and "arbitrary message passing" is that
> > vsock is actually constrained by the virtio device that backs it (made
> > up of virtqueues and the underlying protocol). That virtqueue pair is
> > acting like the queues on a physical NIC, so it actually makes sense to
> > manage the queuing of vsock's device like we would manage the queueing
> > of a real device.
> >
> > Still, I concede that ignoring the netdev state is probably a bad idea.
> >
> > That said, I also think that using packet scheduling in vsock is a good
> > idea, and that ideally we can reuse Linux's already robust library of
> > packet scheduling algorithms by introducing qdisc somehow.
>
> We've been burnt in the past by people doing the "let me just pick
> these useful pieces out of netdev" thing. Makes life hard both for
> maintainers and users trying to make sense of the interfaces.

I interpret this in a different way: we just believe "one size does not
fit all", as most Linux kernel developers do. I am very surprised you
don't.

Feel free to suggest any other ways; eventually you will need to
reimplement TC one way or the other.

If you think about it in another way, vsock is networking too; its name
contains a "sock". Do I need to say more? :)

>
> What comes to mind if you're just after queuing is that we already
> bastardized the CoDel implementation (include/net/codel_impl.h).
> If CoDel is good enough for you maybe that's the easiest way?
> Although I suspect that you're after fairness not early drops.
> Wireless folks use CoDel as a second layer queuing. (CC: Toke)

What makes you believe CoDel fits all cases? If it really does, you
would probably have to convince Toke to give up his idea of an XDP map,
as it would no longer make any sense. I don't see you raising such
an argument there... What makes you treat this differently from the XDP
map? I am very curious about your thought process here. ;-)

Thanks.
Michael S. Tsirkin Aug. 17, 2022, 5:20 p.m. UTC | #11
On Tue, Aug 16, 2022 at 10:50:55AM +0000, Bobby Eshleman wrote:
> > > > Eh, I was hoping it was a side channel of an existing virtio_net 
> > > > which is not the case. Given the zero-config requirement IDK if 
> > > > we'll be able to fit this into netdev semantics :(  
> > > 
> > > It's certainly possible that it may not fit :/ I feel that it partially
> > > depends on what we mean by zero-config. Is it "no config required to
> > > have a working socket" or is it "no config required, but also no
> > > tuning/policy/etc... supported"?
> > 
> > The value of tuning vs confusion of a strange netdev floating around
> > in the system is hard to estimate upfront. 
> 
> I think "a strange netdev floating around" is a total
> mischaracterization... vsock is a networking device and it supports
> vsock networks. Sure, it is a virtual device and the routing is done in
> host software, but the same is true for virtio-net and VM-to-VM vlan.
> 
> This patch actually uses netdev for its intended purpose: to support and
> manage the transmission of packets via a network device to a network.
> 
> Furthermore, it actually prepares vsock to eliminate a "strange" use of
> a netdev. The netdev in vsockmon isn't even used to transmit
> packets, it's "floating around" for no other reason than it is needed to
> support packet capture, which vsock couldn't support because it didn't
> have a netdev.
> 
> Something smells when we are required to build workaround kernel modules
> that use netdev for siphoning packets off to userspace, when we could
> instead be using netdev for its intended purpose and get the same
> benefit and more.

So what happens when userspace inevitably attempts to bind a raw
packet socket to this device? Assign it an IP? Set up some firewall
rules?

These things all need to be addressed before merging since they affect UAPI.


> > 
> > The nice thing about using a built-in fq with no user visible knobs is
> > that there's no extra uAPI. We can always rip it out and replace later.
> > And it shouldn't be controversial, making the path to upstream smoother.
> 
> The issue is that after pulling in fq for one kind of flow management,
> as users observe other flow issues we will need to re-implement pfifo,
> and then TBF, and then build an interface to let users select one and
> choose queue sizes... and after a while we've needlessly re-implemented
> huge chunks of the tc system.
> 
> I don't see any good reason to restrict vsock users to using suboptimal
> and rigid queuing.
> 
> Thanks.
Jason Wang Aug. 18, 2022, 4:34 a.m. UTC | #12
On 2022/8/18 01:20, Michael S. Tsirkin wrote:
> On Tue, Aug 16, 2022 at 10:50:55AM +0000, Bobby Eshleman wrote:
>>>>> Eh, I was hoping it was a side channel of an existing virtio_net
>>>>> which is not the case. Given the zero-config requirement IDK if
>>>>> we'll be able to fit this into netdev semantics :(
>>>> It's certainly possible that it may not fit :/ I feel that it partially
>>>> depends on what we mean by zero-config. Is it "no config required to
>>>> have a working socket" or is it "no config required, but also no
>>>> tuning/policy/etc... supported"?
>>> The value of tuning vs confusion of a strange netdev floating around
>>> in the system is hard to estimate upfront.
>> I think "a strange netdev floating around" is a total
>> mischaracterization... vsock is a networking device and it supports
>> vsock networks. Sure, it is a virtual device and the routing is done in
>> host software, but the same is true for virtio-net and VM-to-VM vlan.
>>
>> This patch actually uses netdev for its intended purpose: to support and
>> manage the transmission of packets via a network device to a network.
>>
>> Furthermore, it actually prepares vsock to eliminate a "strange" use of
>> a netdev. The netdev in vsockmon isn't even used to transmit
>> packets, it's "floating around" for no other reason than it is needed to
>> support packet capture, which vsock couldn't support because it didn't
>> have a netdev.
>>
>> Something smells when we are required to build workaround kernel modules
>> that use netdev for siphoning packets off to userspace, when we could
>> instead be using netdev for its intended purpose and get the same
>> benefit and more.
> So what happens when userspace inevitably attempts to bind a raw
> packet socket to this device? Assign it an IP? Set up some firewall
> rules?
>
> These things all need to be addressed before merging since they affect UAPI.


It's possible if

1) extend virtio-net to have vsock queues

2) present a vsock device on top of virtio-net via e.g. an auxiliary bus

Then raw sockets still work at the Ethernet level while vsock works too.

The value is to share code between the two types of devices (queues).
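
A rough skeleton of (2), with made-up names, assuming the stock
auxiliary bus API (auxiliary_device_init()/auxiliary_device_add()):

#include <linux/auxiliary_bus.h>

/* Hypothetical: virtio-net exposes a "vsock" auxiliary device that a
 * separate vsock driver binds to in order to drive the vsock queues.
 */
static void virtnet_vsock_adev_release(struct device *dev)
{
	/* nothing dynamically allocated in this sketch */
}

static struct auxiliary_device virtnet_vsock_adev;

static int virtnet_add_vsock_adev(struct device *parent)
{
	int ret;

	virtnet_vsock_adev.name = "vsock"; /* matched as "virtio_net.vsock" */
	virtnet_vsock_adev.id = 0;
	virtnet_vsock_adev.dev.parent = parent;
	virtnet_vsock_adev.dev.release = virtnet_vsock_adev_release;

	ret = auxiliary_device_init(&virtnet_vsock_adev);
	if (ret)
		return ret;

	ret = auxiliary_device_add(&virtnet_vsock_adev);
	if (ret)
		auxiliary_device_uninit(&virtnet_vsock_adev);

	return ret;
}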

Thanks


>
>>> The nice thing about using a built-in fq with no user visible knobs is
>>> that there's no extra uAPI. We can always rip it out and replace later.
>>> And it shouldn't be controversial, making the path to upstream smoother.
>> The issue is that after pulling in fq for one kind of flow management,
>> as users observe other flow issues we will need to re-implement pfifo,
>> and then TBF, and then build an interface to let users select one and
>> choose queue sizes... and after a while we've needlessly re-implemented
>> huge chunks of the tc system.
>>
>> I don't see any good reason to restrict vsock users to using suboptimal
>> and rigid queuing.
>>
>> Thanks.
Bobby Eshleman Aug. 18, 2022, 2:20 p.m. UTC | #13
On Tue, Sep 06, 2022 at 06:58:32AM -0400, Michael S. Tsirkin wrote:
> On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote:
> > [... commit message snipped ...]
> 
> 
> I've been thinking about this generally. vsock currently
> assumes reliability, but with qdisc can't we get
> packet drops e.g. depending on the queueing?
> 
> What prevents user from configuring such a discipline?
> One thing people like about vsock is that it's very hard
> to break H2G communication even with misconfigured
> networking.
> 

If qdisc decides to discard a packet, dev_queue_xmit() returns
NET_XMIT_CN. This v1 allows that quietly, but v2 could return an error
to the user (-ENOMEM or maybe -ENOBUFS) when it happens, similar to
when vsock is currently unable to enqueue a packet.
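
Roughly, a v2 sketch against this patch's virtio_transport_send_pkt()
(the -ENOBUFS choice is illustrative only, not settled):

/* v2 sketch: surface qdisc drops/congestion to the caller instead of
 * treating NET_XMIT_CN as success.
 */
static int virtio_transport_send_pkt(const struct virtio_transport *t,
				     struct sk_buff *skb)
{
	int len = skb->len;
	int ret;

	if (unlikely(!t->dev || !(t->dev->flags & IFF_UP)))
		return t->send_pkt(skb);

	skb->dev = t->dev;
	ret = dev_queue_xmit(skb);

	if (ret == NET_XMIT_SUCCESS)
		return len;

	return -ENOBUFS;
}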

The user could still, for example, choose the noop qdisc. Assuming the
v2 change mentioned above, their sendmsg() calls will return errors,
similar to how choosing the wrong CID yields an error when connecting a
socket.

Best,
Bobby
Michael S. Tsirkin Sept. 6, 2022, 10:58 a.m. UTC | #14
On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote:
> In order to support usage of qdisc on vsock traffic, this commit
> introduces a struct net_device to vhost and virtio vsock.
> 
> Two new devices are created, vhost-vsock for vhost and virtio-vsock
> for virtio. The devices are attached to the respective transports.
> 
> To bypass the usage of the device, the user may "down" the associated
> network interface using common tools. For example, "ip link set dev
> virtio-vsock down" lets vsock bypass the net_device and qdisc entirely,
> simply using the FIFO logic of the prior implementation.
> 
> [... remainder of commit message snipped ...]


I've been thinking about this generally. vsock currently
assumes reliability, but with qdisc can't we get
packet drops e.g. depending on the queueing?

What prevents user from configuring such a discipline?
One thing people like about vsock is that it's very hard
to break H2G communication even with misconfigured
networking.
diff mbox series

Patch

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f8601d93d94d..b20ddec2664b 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -927,13 +927,30 @@  static int __init vhost_vsock_init(void)
 				  VSOCK_TRANSPORT_F_H2G);
 	if (ret < 0)
 		return ret;
-	return misc_register(&vhost_vsock_misc);
+
+	ret = virtio_transport_init(&vhost_transport, "vhost-vsock");
+	if (ret < 0)
+		goto out_unregister;
+
+	ret = misc_register(&vhost_vsock_misc);
+	if (ret < 0)
+		goto out_transport_exit;
+	return ret;
+
+out_transport_exit:
+	virtio_transport_exit(&vhost_transport);
+
+out_unregister:
+	vsock_core_unregister(&vhost_transport.transport);
+	return ret;
+
 };
 
 static void __exit vhost_vsock_exit(void)
 {
 	misc_deregister(&vhost_vsock_misc);
 	vsock_core_unregister(&vhost_transport.transport);
+	virtio_transport_exit(&vhost_transport);
 };
 
 module_init(vhost_vsock_init);
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 9a37eddbb87a..5d7e7fbd75f8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -91,10 +91,20 @@  struct virtio_transport {
 	/* This must be the first field */
 	struct vsock_transport transport;
 
+	/* Used almost exclusively for qdisc */
+	struct net_device *dev;
+
 	/* Takes ownership of the packet */
 	int (*send_pkt)(struct sk_buff *skb);
 };
 
+int
+virtio_transport_init(struct virtio_transport *t,
+		      const char *name);
+
+void
+virtio_transport_exit(struct virtio_transport *t);
+
 ssize_t
 virtio_transport_stream_dequeue(struct vsock_sock *vsk,
 				struct msghdr *msg,
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 3bb293fd8607..c6212eb38d3c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -131,7 +131,9 @@  virtio_transport_send_pkt_work(struct work_struct *work)
 		 * the vq
 		 */
 		if (ret < 0) {
-			skb_queue_head(&vsock->send_pkt_queue, skb);
+			spin_lock_bh(&vsock->send_pkt_queue.lock);
+			__skb_queue_head(&vsock->send_pkt_queue, skb);
+			spin_unlock_bh(&vsock->send_pkt_queue.lock);
 			break;
 		}
 
@@ -676,7 +678,9 @@  static void virtio_vsock_vqs_del(struct virtio_vsock *vsock)
 		kfree_skb(skb);
 	mutex_unlock(&vsock->tx_lock);
 
-	skb_queue_purge(&vsock->send_pkt_queue);
+	spin_lock_bh(&vsock->send_pkt_queue.lock);
+	__skb_queue_purge(&vsock->send_pkt_queue);
+	spin_unlock_bh(&vsock->send_pkt_queue.lock);
 
 	/* Delete virtqueues and flush outstanding callbacks if any */
 	vdev->config->del_vqs(vdev);
@@ -760,6 +764,8 @@  static void virtio_vsock_remove(struct virtio_device *vdev)
 	flush_work(&vsock->event_work);
 	flush_work(&vsock->send_pkt_work);
 
+	virtio_transport_exit(&virtio_transport);
+
 	mutex_unlock(&the_virtio_vsock_mutex);
 
 	kfree(vsock);
@@ -844,12 +850,18 @@  static int __init virtio_vsock_init(void)
 	if (ret)
 		goto out_wq;
 
-	ret = register_virtio_driver(&virtio_vsock_driver);
+	ret = virtio_transport_init(&virtio_transport, "virtio-vsock");
 	if (ret)
 		goto out_vci;
 
+	ret = register_virtio_driver(&virtio_vsock_driver);
+	if (ret)
+		goto out_transport;
+
 	return 0;
 
+out_transport:
+	virtio_transport_exit(&virtio_transport);
 out_vci:
 	vsock_core_unregister(&virtio_transport.transport);
 out_wq:
@@ -861,6 +873,7 @@  static void __exit virtio_vsock_exit(void)
 {
 	unregister_virtio_driver(&virtio_vsock_driver);
 	vsock_core_unregister(&virtio_transport.transport);
+	virtio_transport_exit(&virtio_transport);
 	destroy_workqueue(virtio_vsock_workqueue);
 }
 
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index d5780599fe93..bdf16fff054f 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -16,6 +16,7 @@ 
 
 #include <net/sock.h>
 #include <net/af_vsock.h>
+#include <net/pkt_sched.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/vsock_virtio_transport_common.h>
@@ -23,6 +24,93 @@ 
 /* How long to wait for graceful shutdown of a connection */
 #define VSOCK_CLOSE_TIMEOUT (8 * HZ)
 
+struct virtio_transport_priv {
+	struct virtio_transport *trans;
+};
+
+static netdev_tx_t virtio_transport_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct virtio_transport *t =
+		((struct virtio_transport_priv *)netdev_priv(dev))->trans;
+	int ret;
+
+	ret = t->send_pkt(skb);
+	if (unlikely(ret == -ENODEV))
+		return NETDEV_TX_BUSY;
+
+	return NETDEV_TX_OK;
+}
+
+const struct net_device_ops virtio_transport_netdev_ops = {
+	.ndo_start_xmit = virtio_transport_start_xmit,
+};
+
+static void virtio_transport_setup(struct net_device *dev)
+{
+	dev->netdev_ops = &virtio_transport_netdev_ops;
+	dev->needs_free_netdev = true;
+	dev->flags = IFF_NOARP;
+	dev->mtu = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+	dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
+}
+
+static int ifup(struct net_device *dev)
+{
+	int ret;
+
+	rtnl_lock();
+	ret = dev_open(dev, NULL) ? -ENOMEM : 0;
+	rtnl_unlock();
+
+	return ret;
+}
+
+/* virtio_transport_init - initialize a virtio vsock transport layer
+ *
+ * @t: ptr to the virtio transport struct to initialize
+ * @name: the name of the net_device to be created.
+ *
+ * Return 0 on success, otherwise negative errno.
+ */
+int virtio_transport_init(struct virtio_transport *t, const char *name)
+{
+	struct virtio_transport_priv *priv;
+	int ret;
+
+	t->dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN, virtio_transport_setup);
+	if (!t->dev)
+		return -ENOMEM;
+
+	priv = netdev_priv(t->dev);
+	priv->trans = t;
+
+	ret = register_netdev(t->dev);
+	if (ret < 0)
+		goto out_free_netdev;
+
+	ret = ifup(t->dev);
+	if (ret < 0)
+		goto out_unregister_netdev;
+
+	return 0;
+
+out_unregister_netdev:
+	unregister_netdev(t->dev);
+
+out_free_netdev:
+	free_netdev(t->dev);
+
+	return ret;
+}
+
+void virtio_transport_exit(struct virtio_transport *t)
+{
+	if (t->dev) {
+		unregister_netdev(t->dev);
+		free_netdev(t->dev);
+	}
+}
+
 static const struct virtio_transport *
 virtio_transport_get_ops(struct vsock_sock *vsk)
 {
@@ -147,6 +235,24 @@  static u16 virtio_transport_get_type(struct sock *sk)
 		return VIRTIO_VSOCK_TYPE_SEQPACKET;
 }
 
+/* Return pkt->len on success, otherwise negative errno */
+static int virtio_transport_send_pkt(const struct virtio_transport *t, struct sk_buff *skb)
+{
+	int ret;
+	int len = skb->len;
+
+	if (unlikely(!t->dev || !(t->dev->flags & IFF_UP)))
+		return t->send_pkt(skb);
+
+	skb->dev = t->dev;
+	ret = dev_queue_xmit(skb);
+
+	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN))
+		return len;
+
+	return -ENOMEM;
+}
+
 /* This function can only be used on connecting/connected sockets,
  * since a socket assigned to a transport is required.
  *
@@ -202,9 +308,7 @@  static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 
 	virtio_transport_inc_tx_pkt(vvs, skb);
 
-	err = t_ops->send_pkt(skb);
-
-	return err < 0 ? -ENOMEM : err;
+	return virtio_transport_send_pkt(t_ops, skb);
 }
 
 static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
@@ -834,7 +938,7 @@  static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
 		return -ENOTCONN;
 	}
 
-	return t->send_pkt(reply);
+	return virtio_transport_send_pkt(t, reply);
 }
 
 /* This function should be called with sk_lock held and SOCK_DONE set */