Message ID | 5a93c5aad99d79f028d349cb7e3c128c65d5d7e2.1660362668.git.bobby.eshleman@bytedance.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | virtio/vsock: introduce dgrams, sk_buff, and qdisc | expand |
CC'ing virtio-dev@lists.oasis-open.org On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote: > In order to support usage of qdisc on vsock traffic, this commit > introduces a struct net_device to vhost and virtio vsock. > > Two new devices are created, vhost-vsock for vhost and virtio-vsock > for virtio. The devices are attached to the respective transports. > > To bypass the usage of the device, the user may "down" the associated > network interface using common tools. For example, "ip link set dev > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely, > simply using the FIFO logic of the prior implementation. > > For both hosts and guests, there is one device for all G2H vsock sockets > and one device for all H2G vsock sockets. This makes sense for guests > because the driver only supports a single vsock channel (one pair of > TX/RX virtqueues), so one device and qdisc fits. For hosts, this may not > seem ideal for some workloads. However, it is possible to use a > multi-queue qdisc, where a given queue is responsible for a range of > sockets. This seems to be a better solution than having one device per > socket, which may yield a very large number of devices and qdiscs, all > of which are dynamically being created and destroyed. Because of this > dynamism, it would also require a complex policy management daemon, as > devices would constantly be spun up and down as sockets were created and > destroyed. To avoid this, one device and qdisc also applies to all H2G > sockets. 
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com> > --- > drivers/vhost/vsock.c | 19 +++- > include/linux/virtio_vsock.h | 10 +++ > net/vmw_vsock/virtio_transport.c | 19 +++- > net/vmw_vsock/virtio_transport_common.c | 112 +++++++++++++++++++++++- > 4 files changed, 152 insertions(+), 8 deletions(-) > > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c > index f8601d93d94d..b20ddec2664b 100644 > --- a/drivers/vhost/vsock.c > +++ b/drivers/vhost/vsock.c > @@ -927,13 +927,30 @@ static int __init vhost_vsock_init(void) > VSOCK_TRANSPORT_F_H2G); > if (ret < 0) > return ret; > - return misc_register(&vhost_vsock_misc); > + > + ret = virtio_transport_init(&vhost_transport, "vhost-vsock"); > + if (ret < 0) > + goto out_unregister; > + > + ret = misc_register(&vhost_vsock_misc); > + if (ret < 0) > + goto out_transport_exit; > + return ret; > + > +out_transport_exit: > + virtio_transport_exit(&vhost_transport); > + > +out_unregister: > + vsock_core_unregister(&vhost_transport.transport); > + return ret; > + > }; > > static void __exit vhost_vsock_exit(void) > { > misc_deregister(&vhost_vsock_misc); > vsock_core_unregister(&vhost_transport.transport); > + virtio_transport_exit(&vhost_transport); > }; > > module_init(vhost_vsock_init); > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h > index 9a37eddbb87a..5d7e7fbd75f8 100644 > --- a/include/linux/virtio_vsock.h > +++ b/include/linux/virtio_vsock.h > @@ -91,10 +91,20 @@ struct virtio_transport { > /* This must be the first field */ > struct vsock_transport transport; > > + /* Used almost exclusively for qdisc */ > + struct net_device *dev; > + > /* Takes ownership of the packet */ > int (*send_pkt)(struct sk_buff *skb); > }; > > +int > +virtio_transport_init(struct virtio_transport *t, > + const char *name); > + > +void > +virtio_transport_exit(struct virtio_transport *t); > + > ssize_t > virtio_transport_stream_dequeue(struct vsock_sock *vsk, > struct msghdr *msg, > diff --git 
a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c > index 3bb293fd8607..c6212eb38d3c 100644 > --- a/net/vmw_vsock/virtio_transport.c > +++ b/net/vmw_vsock/virtio_transport.c > @@ -131,7 +131,9 @@ virtio_transport_send_pkt_work(struct work_struct *work) > * the vq > */ > if (ret < 0) { > - skb_queue_head(&vsock->send_pkt_queue, skb); > + spin_lock_bh(&vsock->send_pkt_queue.lock); > + __skb_queue_head(&vsock->send_pkt_queue, skb); > + spin_unlock_bh(&vsock->send_pkt_queue.lock); > break; > } > > @@ -676,7 +678,9 @@ static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) > kfree_skb(skb); > mutex_unlock(&vsock->tx_lock); > > - skb_queue_purge(&vsock->send_pkt_queue); > + spin_lock_bh(&vsock->send_pkt_queue.lock); > + __skb_queue_purge(&vsock->send_pkt_queue); > + spin_unlock_bh(&vsock->send_pkt_queue.lock); > > /* Delete virtqueues and flush outstanding callbacks if any */ > vdev->config->del_vqs(vdev); > @@ -760,6 +764,8 @@ static void virtio_vsock_remove(struct virtio_device *vdev) > flush_work(&vsock->event_work); > flush_work(&vsock->send_pkt_work); > > + virtio_transport_exit(&virtio_transport); > + > mutex_unlock(&the_virtio_vsock_mutex); > > kfree(vsock); > @@ -844,12 +850,18 @@ static int __init virtio_vsock_init(void) > if (ret) > goto out_wq; > > - ret = register_virtio_driver(&virtio_vsock_driver); > + ret = virtio_transport_init(&virtio_transport, "virtio-vsock"); > if (ret) > goto out_vci; > > + ret = register_virtio_driver(&virtio_vsock_driver); > + if (ret) > + goto out_transport; > + > return 0; > > +out_transport: > + virtio_transport_exit(&virtio_transport); > out_vci: > vsock_core_unregister(&virtio_transport.transport); > out_wq: > @@ -861,6 +873,7 @@ static void __exit virtio_vsock_exit(void) > { > unregister_virtio_driver(&virtio_vsock_driver); > vsock_core_unregister(&virtio_transport.transport); > + virtio_transport_exit(&virtio_transport); > destroy_workqueue(virtio_vsock_workqueue); > } > > diff --git 
a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c > index d5780599fe93..bdf16fff054f 100644 > --- a/net/vmw_vsock/virtio_transport_common.c > +++ b/net/vmw_vsock/virtio_transport_common.c > @@ -16,6 +16,7 @@ > > #include <net/sock.h> > #include <net/af_vsock.h> > +#include <net/pkt_sched.h> > > #define CREATE_TRACE_POINTS > #include <trace/events/vsock_virtio_transport_common.h> > @@ -23,6 +24,93 @@ > /* How long to wait for graceful shutdown of a connection */ > #define VSOCK_CLOSE_TIMEOUT (8 * HZ) > > +struct virtio_transport_priv { > + struct virtio_transport *trans; > +}; > + > +static netdev_tx_t virtio_transport_start_xmit(struct sk_buff *skb, struct net_device *dev) > +{ > + struct virtio_transport *t = > + ((struct virtio_transport_priv *)netdev_priv(dev))->trans; > + int ret; > + > + ret = t->send_pkt(skb); > + if (unlikely(ret == -ENODEV)) > + return NETDEV_TX_BUSY; > + > + return NETDEV_TX_OK; > +} > + > +const struct net_device_ops virtio_transport_netdev_ops = { > + .ndo_start_xmit = virtio_transport_start_xmit, > +}; > + > +static void virtio_transport_setup(struct net_device *dev) > +{ > + dev->netdev_ops = &virtio_transport_netdev_ops; > + dev->needs_free_netdev = true; > + dev->flags = IFF_NOARP; > + dev->mtu = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; > + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; > +} > + > +static int ifup(struct net_device *dev) > +{ > + int ret; > + > + rtnl_lock(); > + ret = dev_open(dev, NULL) ? -ENOMEM : 0; > + rtnl_unlock(); > + > + return ret; > +} > + > +/* virtio_transport_init - initialize a virtio vsock transport layer > + * > + * @t: ptr to the virtio transport struct to initialize > + * @name: the name of the net_device to be created. > + * > + * Return 0 on success, otherwise negative errno. 
> + */ > +int virtio_transport_init(struct virtio_transport *t, const char *name) > +{ > + struct virtio_transport_priv *priv; > + int ret; > + > + t->dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN, virtio_transport_setup); > + if (!t->dev) > + return -ENOMEM; > + > + priv = netdev_priv(t->dev); > + priv->trans = t; > + > + ret = register_netdev(t->dev); > + if (ret < 0) > + goto out_free_netdev; > + > + ret = ifup(t->dev); > + if (ret < 0) > + goto out_unregister_netdev; > + > + return 0; > + > +out_unregister_netdev: > + unregister_netdev(t->dev); > + > +out_free_netdev: > + free_netdev(t->dev); > + > + return ret; > +} > + > +void virtio_transport_exit(struct virtio_transport *t) > +{ > + if (t->dev) { > + unregister_netdev(t->dev); > + free_netdev(t->dev); > + } > +} > + > static const struct virtio_transport * > virtio_transport_get_ops(struct vsock_sock *vsk) > { > @@ -147,6 +235,24 @@ static u16 virtio_transport_get_type(struct sock *sk) > return VIRTIO_VSOCK_TYPE_SEQPACKET; > } > > +/* Return pkt->len on success, otherwise negative errno */ > +static int virtio_transport_send_pkt(const struct virtio_transport *t, struct sk_buff *skb) > +{ > + int ret; > + int len = skb->len; > + > + if (unlikely(!t->dev || !(t->dev->flags & IFF_UP))) > + return t->send_pkt(skb); > + > + skb->dev = t->dev; > + ret = dev_queue_xmit(skb); > + > + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) > + return len; > + > + return -ENOMEM; > +} > + > /* This function can only be used on connecting/connected sockets, > * since a socket assigned to a transport is required. > * > @@ -202,9 +308,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, > > virtio_transport_inc_tx_pkt(vvs, skb); > > - err = t_ops->send_pkt(skb); > - > - return err < 0 ? 
-ENOMEM : err; > + return virtio_transport_send_pkt(t_ops, skb); > } > > static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, > @@ -834,7 +938,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, > return -ENOTCONN; > } > > - return t->send_pkt(reply); > + return virtio_transport_send_pkt(t, reply); > } > > /* This function should be called with sk_lock held and SOCK_DONE set */ > -- > 2.35.1 >
On Tue, Aug 16, 2022 at 12:38:52PM -0400, Michael S. Tsirkin wrote: > On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote: > > In order to support usage of qdisc on vsock traffic, this commit > > introduces a struct net_device to vhost and virtio vsock. > > > > Two new devices are created, vhost-vsock for vhost and virtio-vsock > > for virtio. The devices are attached to the respective transports. > > > > To bypass the usage of the device, the user may "down" the associated > > network interface using common tools. For example, "ip link set dev > > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely, > > simply using the FIFO logic of the prior implementation. > > Ugh. That's quite a hack. Mark my words, at some point we will decide to > have down mean "down". Besides, multiple net devices with link up tend > to confuse userspace. So might want to keep it down at all times > even short term. > I have to admit, this choice was born more of perceived necessity than of a love for the design... but I can explain the pain points that led to the current state, which I hope sparks some discussion. When the state is down, dev_queue_xmit() will fail. To avoid this and preserve the "zero-configuration" guarantee of vsock, I chose to make transmission work regardless of device state by implementing this "ignore up/down state" hack. This is unfortunate because what we are really after here is just packet scheduling, i.e., qdisc. We don't really need the rest of the net_device, and I don't think up/down buys us anything of value. The relationship between qdisc and net_device is so tightly knit together though, that using qdisc without a net_device doesn't look very practical (and maybe impossible). Some alternative routes might be: 1) Default the state to up, and let users disable vsock by downing the device if they'd like. It still works out-of-the-box, but if users really want to disable vsock they may. 
2) vsock may simply set the device state to up when a socket is first used. For instance, the HCI device in net/bluetooth/hci_* uses a technique where the net_device is brought up when bind() is called on any HCI socket (BTPROTO_HCI). It can also be turned up/down via ioctl(). 3) Modify net_device registration to allow us to have an invisible device that is only known by the kernel. It may default to up and remain unchanged. The qdisc controls alone may be exposed to userspace, hopefully via netlink to still work with tc. This is not currently supported by register_netdevice(), but a series from 2007 was sent to the ML, tentatively approved in concept, but was never merged[1]. 4) Currently NETDEV_UP/NETDEV_DOWN commands can't be vetoed. NETDEV_PRE_UP, however, is used to effectively veto NETDEV_UP commands[2]. We could introduce NETDEV_PRE_DOWN to support vetoing of NETDEV_DOWN. This would allow us to install a hook to determine if we actually want to allow the device to be downed. In an ideal world, we could simply pass a set of netdev queues, a packet, and maybe a blob of state to qdisc and let it work its scheduling magic... Any thoughts? [1]: https://lore.kernel.org/netdev/20070129140958.0cf6880f@freekitty/ [2]: https://lore.kernel.org/all/20090529.220906.243061042.davem@davemloft.net/ Thanks, Bobby
On Tue, Aug 16, 2022 at 11:07:17AM -0700, Jakub Kicinski wrote: > On Tue, 16 Aug 2022 12:38:52 -0400 Michael S. Tsirkin wrote: > > On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote: > > > In order to support usage of qdisc on vsock traffic, this commit > > > introduces a struct net_device to vhost and virtio vsock. > > > > > > Two new devices are created, vhost-vsock for vhost and virtio-vsock > > > for virtio. The devices are attached to the respective transports. > > > > > > To bypass the usage of the device, the user may "down" the associated > > > network interface using common tools. For example, "ip link set dev > > > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely, > > > simply using the FIFO logic of the prior implementation. > > > > Ugh. That's quite a hack. Mark my words, at some point we will decide to > > have down mean "down". Besides, multiple net devices with link up tend > > to confuse userspace. So might want to keep it down at all times > > even short term. > > Agreed! > > From a cursory look (and Documentation/ would be nice..) it feels > very wrong to me. Do you know of any uses of a netdev which would > be semantically similar to what you're doing? Treating netdevs as > buildings blocks for arbitrary message passing solutions is something > I dislike quite strongly. The big difference between vsock and "arbitrary message passing" is that vsock is actually constrained by the virtio device that backs it (made up of virtqueues and the underlying protocol). That virtqueue pair is acting like the queues on a physical NIC, so it actually makes sense to manage the queuing of vsock's device like we would manage the queueing of a real device. Still, I concede that ignoring the netdev state is a probably bad idea. That said, I also think that using packet scheduling in vsock is a good idea, and that ideally we can reuse Linux's already robust library of packet scheduling algorithms by introducing qdisc somehow. 
> > Could you recommend where I can learn more about vsocks? I think the spec is probably the best place to start[1]. [1]: https://docs.oasis-open.org/virtio/virtio/v1.2/virtio-v1.2.html Best, Bobby
On Tue, Aug 16, 2022 at 04:07:55PM -0700, Jakub Kicinski wrote: > On Tue, 16 Aug 2022 07:02:33 +0000 Bobby Eshleman wrote: > > > From a cursory look (and Documentation/ would be nice..) it feels > > > very wrong to me. Do you know of any uses of a netdev which would > > > be semantically similar to what you're doing? Treating netdevs as > > > buildings blocks for arbitrary message passing solutions is something > > > I dislike quite strongly. > > > > The big difference between vsock and "arbitrary message passing" is that > > vsock is actually constrained by the virtio device that backs it (made > > up of virtqueues and the underlying protocol). That virtqueue pair is > > acting like the queues on a physical NIC, so it actually makes sense to > > manage the queuing of vsock's device like we would manage the queueing > > of a real device. > > > > Still, I concede that ignoring the netdev state is a probably bad idea. > > > > That said, I also think that using packet scheduling in vsock is a good > > idea, and that ideally we can reuse Linux's already robust library of > > packet scheduling algorithms by introducing qdisc somehow. > > We've been burnt in the past by people doing the "let me just pick > these useful pieces out of netdev" thing. Makes life hard both for > maintainers and users trying to make sense of the interfaces. > > What comes to mind if you're just after queuing is that we already > bastardized the CoDel implementation (include/net/codel_impl.h). > If CoDel is good enough for you maybe that's the easiest way? > Although I suspect that you're after fairness not early drops. > Wireless folks use CoDel as a second layer queuing. (CC: Toke) > That is certainly interesting to me. Sitting next to "codel_impl.h" is "include/net/fq_impl.h", and it looks like it may solve the datagram flooding issue. The downside to this approach is the baking of a specific policy into vsock... which I don't exactly love either. 
I'm not seeing too many other of these qdisc bastardizations in include/net, are there any others that you are aware of? > > > Could you recommend where I can learn more about vsocks? > > > > I think the spec is probably the best place to start[1]. > > > > [1]: https://docs.oasis-open.org/virtio/virtio/v1.2/virtio-v1.2.html > > Eh, I was hoping it was a side channel of an existing virtio_net > which is not the case. Given the zero-config requirement IDK if > we'll be able to fit this into netdev semantics :( It's certainly possible that it may not fit :/ I feel that it partially depends on what we mean by zero-config. Is it "no config required to have a working socket" or is it "no config required, but also no tuning/policy/etc... supported"? Best, Bobby
On Tue, Aug 16, 2022 at 06:15:28PM -0700, Jakub Kicinski wrote: > On Tue, 16 Aug 2022 08:29:04 +0000 Bobby Eshleman wrote: > > > We've been burnt in the past by people doing the "let me just pick > > > these useful pieces out of netdev" thing. Makes life hard both for > > > maintainers and users trying to make sense of the interfaces. > > > > > > What comes to mind if you're just after queuing is that we already > > > bastardized the CoDel implementation (include/net/codel_impl.h). > > > If CoDel is good enough for you maybe that's the easiest way? > > > Although I suspect that you're after fairness not early drops. > > > Wireless folks use CoDel as a second layer queuing. (CC: Toke) > > > > That is certainly interesting to me. Sitting next to "codel_impl.h" is > > "include/net/fq_impl.h", and it looks like it may solve the datagram > > flooding issue. The downside to this approach is the baking of a > > specific policy into vsock... which I don't exactly love either. > > > > I'm not seeing too many other of these qdisc bastardizations in > > include/net, are there any others that you are aware of? > > Just what wireless uses (so codel and fq as you found out), nothing > else comes to mind. > > > > Eh, I was hoping it was a side channel of an existing virtio_net > > > which is not the case. Given the zero-config requirement IDK if > > > we'll be able to fit this into netdev semantics :( > > > > It's certainly possible that it may not fit :/ I feel that it partially > > depends on what we mean by zero-config. Is it "no config required to > > have a working socket" or is it "no config required, but also no > > tuning/policy/etc... supported"? > > The value of tuning vs confusion of a strange netdev floating around > in the system is hard to estimate upfront. I think "a strange netdev floating around" is a total mischaracterization... vsock is a networking device and it supports vsock networks. 
Sure, it is a virtual device and the routing is done in host software, but the same is true for virtio-net and VM-to-VM vlan. This patch actually uses netdev for its intended purpose: to support and manage the transmission of packets via a network device to a network. Furthermore, it actually prepares vsock to eliminate a "strange" use of a netdev. The netdev in vsockmon isn't even used to transmit packets, it's "floating around" for no other reason than it is needed to support packet capture, which vsock couldn't support because it didn't have a netdev. Something smells when we are required to build workaround kernel modules that use netdev for siphoning packets off to userspace, when we could instead be using netdev for its intended purpose and get the same and more benefit. > > The nice thing about using a built-in fq with no user visible knobs is > that there's no extra uAPI. We can always rip it out and replace later. > And it shouldn't be controversial, making the path to upstream smoother. The issue is that after pulling in fq for one kind of flow management, then as users observe other flow issues, we will need to re-implement pfifo, and then TBF, and then we need to build an interface to let users select one, and to choose queue sizes... and then after a while we've needlessly re-implemented huge chunks of the tc system. I don't see any good reason to restrict vsock users to using suboptimal and rigid queuing. Thanks.
On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote: > In order to support usage of qdisc on vsock traffic, this commit > introduces a struct net_device to vhost and virtio vsock. > > Two new devices are created, vhost-vsock for vhost and virtio-vsock > for virtio. The devices are attached to the respective transports. > > To bypass the usage of the device, the user may "down" the associated > network interface using common tools. For example, "ip link set dev > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely, > simply using the FIFO logic of the prior implementation. Ugh. That's quite a hack. Mark my words, at some point we will decide to have down mean "down". Besides, multiple net devices with link up tend to confuse userspace. So might want to keep it down at all times even short term. > For both hosts and guests, there is one device for all G2H vsock sockets > and one device for all H2G vsock sockets. This makes sense for guests > because the driver only supports a single vsock channel (one pair of > TX/RX virtqueues), so one device and qdisc fits. For hosts, this may not > seem ideal for some workloads. However, it is possible to use a > multi-queue qdisc, where a given queue is responsible for a range of > sockets. This seems to be a better solution than having one device per > socket, which may yield a very large number of devices and qdiscs, all > of which are dynamically being created and destroyed. Because of this > dynamism, it would also require a complex policy management daemon, as > devices would constantly be spun up and down as sockets were created and > destroyed. To avoid this, one device and qdisc also applies to all H2G > sockets. 
> > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com> > --- > drivers/vhost/vsock.c | 19 +++- > include/linux/virtio_vsock.h | 10 +++ > net/vmw_vsock/virtio_transport.c | 19 +++- > net/vmw_vsock/virtio_transport_common.c | 112 +++++++++++++++++++++++- > 4 files changed, 152 insertions(+), 8 deletions(-) > > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c > index f8601d93d94d..b20ddec2664b 100644 > --- a/drivers/vhost/vsock.c > +++ b/drivers/vhost/vsock.c > @@ -927,13 +927,30 @@ static int __init vhost_vsock_init(void) > VSOCK_TRANSPORT_F_H2G); > if (ret < 0) > return ret; > - return misc_register(&vhost_vsock_misc); > + > + ret = virtio_transport_init(&vhost_transport, "vhost-vsock"); > + if (ret < 0) > + goto out_unregister; > + > + ret = misc_register(&vhost_vsock_misc); > + if (ret < 0) > + goto out_transport_exit; > + return ret; > + > +out_transport_exit: > + virtio_transport_exit(&vhost_transport); > + > +out_unregister: > + vsock_core_unregister(&vhost_transport.transport); > + return ret; > + > }; > > static void __exit vhost_vsock_exit(void) > { > misc_deregister(&vhost_vsock_misc); > vsock_core_unregister(&vhost_transport.transport); > + virtio_transport_exit(&vhost_transport); > }; > > module_init(vhost_vsock_init); > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h > index 9a37eddbb87a..5d7e7fbd75f8 100644 > --- a/include/linux/virtio_vsock.h > +++ b/include/linux/virtio_vsock.h > @@ -91,10 +91,20 @@ struct virtio_transport { > /* This must be the first field */ > struct vsock_transport transport; > > + /* Used almost exclusively for qdisc */ > + struct net_device *dev; > + > /* Takes ownership of the packet */ > int (*send_pkt)(struct sk_buff *skb); > }; > > +int > +virtio_transport_init(struct virtio_transport *t, > + const char *name); > + > +void > +virtio_transport_exit(struct virtio_transport *t); > + > ssize_t > virtio_transport_stream_dequeue(struct vsock_sock *vsk, > struct msghdr *msg, > diff --git 
a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c > index 3bb293fd8607..c6212eb38d3c 100644 > --- a/net/vmw_vsock/virtio_transport.c > +++ b/net/vmw_vsock/virtio_transport.c > @@ -131,7 +131,9 @@ virtio_transport_send_pkt_work(struct work_struct *work) > * the vq > */ > if (ret < 0) { > - skb_queue_head(&vsock->send_pkt_queue, skb); > + spin_lock_bh(&vsock->send_pkt_queue.lock); > + __skb_queue_head(&vsock->send_pkt_queue, skb); > + spin_unlock_bh(&vsock->send_pkt_queue.lock); > break; > } > > @@ -676,7 +678,9 @@ static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) > kfree_skb(skb); > mutex_unlock(&vsock->tx_lock); > > - skb_queue_purge(&vsock->send_pkt_queue); > + spin_lock_bh(&vsock->send_pkt_queue.lock); > + __skb_queue_purge(&vsock->send_pkt_queue); > + spin_unlock_bh(&vsock->send_pkt_queue.lock); > > /* Delete virtqueues and flush outstanding callbacks if any */ > vdev->config->del_vqs(vdev); > @@ -760,6 +764,8 @@ static void virtio_vsock_remove(struct virtio_device *vdev) > flush_work(&vsock->event_work); > flush_work(&vsock->send_pkt_work); > > + virtio_transport_exit(&virtio_transport); > + > mutex_unlock(&the_virtio_vsock_mutex); > > kfree(vsock); > @@ -844,12 +850,18 @@ static int __init virtio_vsock_init(void) > if (ret) > goto out_wq; > > - ret = register_virtio_driver(&virtio_vsock_driver); > + ret = virtio_transport_init(&virtio_transport, "virtio-vsock"); > if (ret) > goto out_vci; > > + ret = register_virtio_driver(&virtio_vsock_driver); > + if (ret) > + goto out_transport; > + > return 0; > > +out_transport: > + virtio_transport_exit(&virtio_transport); > out_vci: > vsock_core_unregister(&virtio_transport.transport); > out_wq: > @@ -861,6 +873,7 @@ static void __exit virtio_vsock_exit(void) > { > unregister_virtio_driver(&virtio_vsock_driver); > vsock_core_unregister(&virtio_transport.transport); > + virtio_transport_exit(&virtio_transport); > destroy_workqueue(virtio_vsock_workqueue); > } > > diff --git 
a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c > index d5780599fe93..bdf16fff054f 100644 > --- a/net/vmw_vsock/virtio_transport_common.c > +++ b/net/vmw_vsock/virtio_transport_common.c > @@ -16,6 +16,7 @@ > > #include <net/sock.h> > #include <net/af_vsock.h> > +#include <net/pkt_sched.h> > > #define CREATE_TRACE_POINTS > #include <trace/events/vsock_virtio_transport_common.h> > @@ -23,6 +24,93 @@ > /* How long to wait for graceful shutdown of a connection */ > #define VSOCK_CLOSE_TIMEOUT (8 * HZ) > > +struct virtio_transport_priv { > + struct virtio_transport *trans; > +}; > + > +static netdev_tx_t virtio_transport_start_xmit(struct sk_buff *skb, struct net_device *dev) > +{ > + struct virtio_transport *t = > + ((struct virtio_transport_priv *)netdev_priv(dev))->trans; > + int ret; > + > + ret = t->send_pkt(skb); > + if (unlikely(ret == -ENODEV)) > + return NETDEV_TX_BUSY; > + > + return NETDEV_TX_OK; > +} > + > +const struct net_device_ops virtio_transport_netdev_ops = { > + .ndo_start_xmit = virtio_transport_start_xmit, > +}; > + > +static void virtio_transport_setup(struct net_device *dev) > +{ > + dev->netdev_ops = &virtio_transport_netdev_ops; > + dev->needs_free_netdev = true; > + dev->flags = IFF_NOARP; > + dev->mtu = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; > + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; > +} > + > +static int ifup(struct net_device *dev) > +{ > + int ret; > + > + rtnl_lock(); > + ret = dev_open(dev, NULL) ? -ENOMEM : 0; > + rtnl_unlock(); > + > + return ret; > +} > + > +/* virtio_transport_init - initialize a virtio vsock transport layer > + * > + * @t: ptr to the virtio transport struct to initialize > + * @name: the name of the net_device to be created. > + * > + * Return 0 on success, otherwise negative errno. 
> + */ > +int virtio_transport_init(struct virtio_transport *t, const char *name) > +{ > + struct virtio_transport_priv *priv; > + int ret; > + > + t->dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN, virtio_transport_setup); > + if (!t->dev) > + return -ENOMEM; > + > + priv = netdev_priv(t->dev); > + priv->trans = t; > + > + ret = register_netdev(t->dev); > + if (ret < 0) > + goto out_free_netdev; > + > + ret = ifup(t->dev); > + if (ret < 0) > + goto out_unregister_netdev; > + > + return 0; > + > +out_unregister_netdev: > + unregister_netdev(t->dev); > + > +out_free_netdev: > + free_netdev(t->dev); > + > + return ret; > +} > + > +void virtio_transport_exit(struct virtio_transport *t) > +{ > + if (t->dev) { > + unregister_netdev(t->dev); > + free_netdev(t->dev); > + } > +} > + > static const struct virtio_transport * > virtio_transport_get_ops(struct vsock_sock *vsk) > { > @@ -147,6 +235,24 @@ static u16 virtio_transport_get_type(struct sock *sk) > return VIRTIO_VSOCK_TYPE_SEQPACKET; > } > > +/* Return pkt->len on success, otherwise negative errno */ > +static int virtio_transport_send_pkt(const struct virtio_transport *t, struct sk_buff *skb) > +{ > + int ret; > + int len = skb->len; > + > + if (unlikely(!t->dev || !(t->dev->flags & IFF_UP))) > + return t->send_pkt(skb); > + > + skb->dev = t->dev; > + ret = dev_queue_xmit(skb); > + > + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) > + return len; > + > + return -ENOMEM; > +} > + > /* This function can only be used on connecting/connected sockets, > * since a socket assigned to a transport is required. > * > @@ -202,9 +308,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, > > virtio_transport_inc_tx_pkt(vvs, skb); > > - err = t_ops->send_pkt(skb); > - > - return err < 0 ? 
-ENOMEM : err; > + return virtio_transport_send_pkt(t_ops, skb); > } > > static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, > @@ -834,7 +938,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, > return -ENOTCONN; > } > > - return t->send_pkt(reply); > + return virtio_transport_send_pkt(t, reply); > } > > /* This function should be called with sk_lock held and SOCK_DONE set */ > -- > 2.35.1
On Tue, 16 Aug 2022 12:38:52 -0400 Michael S. Tsirkin wrote: > On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote: > > In order to support usage of qdisc on vsock traffic, this commit > > introduces a struct net_device to vhost and virtio vsock. > > > > Two new devices are created, vhost-vsock for vhost and virtio-vsock > > for virtio. The devices are attached to the respective transports. > > > > To bypass the usage of the device, the user may "down" the associated > > network interface using common tools. For example, "ip link set dev > > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely, > > simply using the FIFO logic of the prior implementation. > > Ugh. That's quite a hack. Mark my words, at some point we will decide to > have down mean "down". Besides, multiple net devices with link up tend > to confuse userspace. So might want to keep it down at all times > even short term. Agreed! From a cursory look (and Documentation/ would be nice..) it feels very wrong to me. Do you know of any uses of a netdev which would be semantically similar to what you're doing? Treating netdevs as building blocks for arbitrary message passing solutions is something I dislike quite strongly. Could you recommend where I can learn more about vsocks?
On Tue, 16 Aug 2022 07:02:33 +0000 Bobby Eshleman wrote: > > From a cursory look (and Documentation/ would be nice..) it feels > > very wrong to me. Do you know of any uses of a netdev which would > > be semantically similar to what you're doing? Treating netdevs as > > buildings blocks for arbitrary message passing solutions is something > > I dislike quite strongly. > > The big difference between vsock and "arbitrary message passing" is that > vsock is actually constrained by the virtio device that backs it (made > up of virtqueues and the underlying protocol). That virtqueue pair is > acting like the queues on a physical NIC, so it actually makes sense to > manage the queuing of vsock's device like we would manage the queueing > of a real device. > > Still, I concede that ignoring the netdev state is a probably bad idea. > > That said, I also think that using packet scheduling in vsock is a good > idea, and that ideally we can reuse Linux's already robust library of > packet scheduling algorithms by introducing qdisc somehow. We've been burnt in the past by people doing the "let me just pick these useful pieces out of netdev" thing. Makes life hard both for maintainers and users trying to make sense of the interfaces. What comes to mind if you're just after queuing is that we already bastardized the CoDel implementation (include/net/codel_impl.h). If CoDel is good enough for you maybe that's the easiest way? Although I suspect that you're after fairness not early drops. Wireless folks use CoDel as a second layer queuing. (CC: Toke) > > Could you recommend where I can learn more about vsocks? > > I think the spec is probably the best place to start[1]. > > [1]: https://docs.oasis-open.org/virtio/virtio/v1.2/virtio-v1.2.html Eh, I was hoping it was a side channel of an existing virtio_net which is not the case. Given the zero-config requirement IDK if we'll be able to fit this into netdev semantics :(
On Tue, 16 Aug 2022 08:29:04 +0000 Bobby Eshleman wrote: > > We've been burnt in the past by people doing the "let me just pick > > these useful pieces out of netdev" thing. Makes life hard both for > > maintainers and users trying to make sense of the interfaces. > > > > What comes to mind if you're just after queuing is that we already > > bastardized the CoDel implementation (include/net/codel_impl.h). > > If CoDel is good enough for you maybe that's the easiest way? > > Although I suspect that you're after fairness not early drops. > > Wireless folks use CoDel as a second layer queuing. (CC: Toke) > > That is certainly interesting to me. Sitting next to "codel_impl.h" is > "include/net/fq_impl.h", and it looks like it may solve the datagram > flooding issue. The downside to this approach is the baking of a > specific policy into vsock... which I don't exactly love either. > > I'm not seeing too many other of these qdisc bastardizations in > include/net, are there any others that you are aware of? Just what wireless uses (so codel and fq as you found out), nothing else comes to mind. > > Eh, I was hoping it was a side channel of an existing virtio_net > > which is not the case. Given the zero-config requirement IDK if > > we'll be able to fit this into netdev semantics :( > > It's certainly possible that it may not fit :/ I feel that it partially > depends on what we mean by zero-config. Is it "no config required to > have a working socket" or is it "no config required, but also no > tuning/policy/etc... supported"? The value of tuning vs confusion of a strange netdev floating around in the system is hard to estimate upfront. The nice thing about using a built-in fq with no user visible knobs is that there's no extra uAPI. We can always rip it out and replace later. And it shouldn't be controversial, making the path to upstream smoother.
On Tue, Aug 16, 2022 at 4:07 PM Jakub Kicinski <kuba@kernel.org> wrote: > > On Tue, 16 Aug 2022 07:02:33 +0000 Bobby Eshleman wrote: > > > From a cursory look (and Documentation/ would be nice..) it feels > > > very wrong to me. Do you know of any uses of a netdev which would > > > be semantically similar to what you're doing? Treating netdevs as > > > buildings blocks for arbitrary message passing solutions is something > > > I dislike quite strongly. > > > > The big difference between vsock and "arbitrary message passing" is that > > vsock is actually constrained by the virtio device that backs it (made > > up of virtqueues and the underlying protocol). That virtqueue pair is > > acting like the queues on a physical NIC, so it actually makes sense to > > manage the queuing of vsock's device like we would manage the queueing > > of a real device. > > > > Still, I concede that ignoring the netdev state is a probably bad idea. > > > > That said, I also think that using packet scheduling in vsock is a good > > idea, and that ideally we can reuse Linux's already robust library of > > packet scheduling algorithms by introducing qdisc somehow. > > We've been burnt in the past by people doing the "let me just pick > these useful pieces out of netdev" thing. Makes life hard both for > maintainers and users trying to make sense of the interfaces. I interpret this in a different way: we just believe "one size does not fit all", as most Linux kernel developers do. I am very surprised you don't. Feel free to suggest any other ways, eventually you will need to reimplement TC one way or the other. If you think about it in another way, vsock is networking too, its name contains a "sock", do I need to say more? :) > > What comes to mind if you're just after queuing is that we already > bastardized the CoDel implementation (include/net/codel_impl.h). > If CoDel is good enough for you maybe that's the easiest way? > Although I suspect that you're after fairness not early drops. 
> Wireless folks use CoDel as a second layer queuing. (CC: Toke) What makes you believe CoDel fits all cases? If it really does, you probably have to convince Toke to give up his idea on XDP map as it would no longer make any sense. I don't see you raise such an argument there... What makes you treat this differently with XDP map? I am very curious about your thought process here. ;-) Thanks.
On Tue, Aug 16, 2022 at 10:50:55AM +0000, Bobby Eshleman wrote: > > > > Eh, I was hoping it was a side channel of an existing virtio_net > > > > which is not the case. Given the zero-config requirement IDK if > > > > we'll be able to fit this into netdev semantics :( > > > > > > It's certainly possible that it may not fit :/ I feel that it partially > > > depends on what we mean by zero-config. Is it "no config required to > > > have a working socket" or is it "no config required, but also no > > > tuning/policy/etc... supported"? > > > > The value of tuning vs confusion of a strange netdev floating around > > in the system is hard to estimate upfront. > > I think "a strange netdev floating around" is a total > mischaracterization... vsock is a networking device and it supports > vsock networks. Sure, it is a virtual device and the routing is done in > host software, but the same is true for virtio-net and VM-to-VM vlan. > > This patch actually uses netdev for its intended purpose: to support and > manage the transmission of packets via a network device to a network. > > Furthermore, it actually prepares vsock to eliminate a "strange" use of > a netdev. The netdev in vsockmon isn't even used to transmit > packets, it's "floating around" for no other reason than it is needed to > support packet capture, which vsock couldn't support because it didn't > have a netdev. > > Something smells when we are required to build workaround kernel modules > that use netdev for siphoning packets off to userspace, when we could > instead be using netdev for its intended purpose and get the same and > more benefit. So what happens when userspace inevitably attempts to bind a raw packet socket to this device? Assign it an IP? Set up some firewall rules? These things all need to be addressed before merging since they affect UAPI. > > > > The nice thing about using a built-in fq with no user visible knobs is 
> > And it shouldn't be controversial, making the path to upstream smoother. > > The issue is that after pulling in fq for one kind of flow management, > then as users observe other flow issues, we will need to re-implement > pfifo, and then TBF, and then we need to build an interface to let users > select one, and to choose queue sizes... and then after awhile we've > needlessly re-implemented huge chunks of the tc system. > > I don't see any good reason to restrict vsock users to using suboptimal > and rigid queuing. > > Thanks.
在 2022/8/18 01:20, Michael S. Tsirkin 写道: > On Tue, Aug 16, 2022 at 10:50:55AM +0000, Bobby Eshleman wrote: >>>>> Eh, I was hoping it was a side channel of an existing virtio_net >>>>> which is not the case. Given the zero-config requirement IDK if >>>>> we'll be able to fit this into netdev semantics :( >>>> It's certainly possible that it may not fit :/ I feel that it partially >>>> depends on what we mean by zero-config. Is it "no config required to >>>> have a working socket" or is it "no config required, but also no >>>> tuning/policy/etc... supported"? >>> The value of tuning vs confusion of a strange netdev floating around >>> in the system is hard to estimate upfront. >> I think "a strange netdev floating around" is a total >> mischaracterization... vsock is a networking device and it supports >> vsock networks. Sure, it is a virtual device and the routing is done in >> host software, but the same is true for virtio-net and VM-to-VM vlan. >> >> This patch actually uses netdev for its intended purpose: to support and >> manage the transmission of packets via a network device to a network. >> >> Furthermore, it actually prepares vsock to eliminate a "strange" use of >> a netdev. The netdev in vsockmon isn't even used to transmit >> packets, it's "floating around" for no other reason than it is needed to >> support packet capture, which vsock couldn't support because it didn't >> have a netdev. >> >> Something smells when we are required to build workaround kernel modules >> that use netdev for ciphoning packets off to userspace, when we could >> instead be using netdev for its intended purpose and get the same and >> more benefit. > So what happens when userspace inevitably attempts to bind a raw > packet socket to this device? Assign it an IP? Set up some firewall > rules? > > These things all need to be addressed before merging since they affect UAPI. 
It's possible if 1) extend virtio-net to have vsock queues 2) present vsock device on top of virtio-net via e.g auxiliary bus Then raw socket still work at ethernet level while vsock works too. The value is to share codes between the two type of devices (queues). Thanks > >>> The nice thing about using a built-in fq with no user visible knobs is >>> that there's no extra uAPI. We can always rip it out and replace later. >>> And it shouldn't be controversial, making the path to upstream smoother. >> The issue is that after pulling in fq for one kind of flow management, >> then as users observe other flow issues, we will need to re-implement >> pfifo, and then TBF, and then we need to build an interface to let users >> select one, and to choose queue sizes... and then after awhile we've >> needlessly re-implemented huge chunks of the tc system. >> >> I don't see any good reason to restrict vsock users to using suboptimal >> and rigid queuing. >> >> Thanks.
On Tue, Sep 06, 2022 at 06:58:32AM -0400, Michael S. Tsirkin wrote: > On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote: > > In order to support usage of qdisc on vsock traffic, this commit > > introduces a struct net_device to vhost and virtio vsock. > > > > Two new devices are created, vhost-vsock for vhost and virtio-vsock > > for virtio. The devices are attached to the respective transports. > > > > To bypass the usage of the device, the user may "down" the associated > > network interface using common tools. For example, "ip link set dev > > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely, > > simply using the FIFO logic of the prior implementation. > > > > For both hosts and guests, there is one device for all G2H vsock sockets > > and one device for all H2G vsock sockets. This makes sense for guests > > because the driver only supports a single vsock channel (one pair of > > TX/RX virtqueues), so one device and qdisc fits. For hosts, this may not > > seem ideal for some workloads. However, it is possible to use a > > multi-queue qdisc, where a given queue is responsible for a range of > > sockets. This seems to be a better solution than having one device per > > socket, which may yield a very large number of devices and qdiscs, all > > of which are dynamically being created and destroyed. Because of this > > dynamism, it would also require a complex policy management daemon, as > > devices would constantly be spun up and down as sockets were created and > > destroyed. To avoid this, one device and qdisc also applies to all H2G > > sockets. > > > > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com> > > > I've been thinking about this generally. vsock currently > assumes reliability, but with qdisc can't we get > packet drops e.g. depending on the queueing? > > What prevents user from configuring such a discipline? 
> One thing people like about vsock is that it's very hard > to break H2G communication even with misconfigured > networking. > If qdisc decides to discard a packet, it returns NET_XMIT_CN via dev_queue_xmit(). This v1 allows this quietly, but v2 could return an error to the user (-ENOMEM or maybe -ENOBUFS) when this happens, similar to when vsock is unable to enqueue a packet currently. The user could still, for example, choose the noop qdisc. Assuming the v2 change mentioned above, their sendmsg() calls will return errors. Similar to how if they choose the wrong CID they will get an error when connecting a socket. Best, Bobby
On Mon, Aug 15, 2022 at 10:56:06AM -0700, Bobby Eshleman wrote: > In order to support usage of qdisc on vsock traffic, this commit > introduces a struct net_device to vhost and virtio vsock. > > Two new devices are created, vhost-vsock for vhost and virtio-vsock > for virtio. The devices are attached to the respective transports. > > To bypass the usage of the device, the user may "down" the associated > network interface using common tools. For example, "ip link set dev > virtio-vsock down" lets vsock bypass the net_device and qdisc entirely, > simply using the FIFO logic of the prior implementation. > > For both hosts and guests, there is one device for all G2H vsock sockets > and one device for all H2G vsock sockets. This makes sense for guests > because the driver only supports a single vsock channel (one pair of > TX/RX virtqueues), so one device and qdisc fits. For hosts, this may not > seem ideal for some workloads. However, it is possible to use a > multi-queue qdisc, where a given queue is responsible for a range of > sockets. This seems to be a better solution than having one device per > socket, which may yield a very large number of devices and qdiscs, all > of which are dynamically being created and destroyed. Because of this > dynamism, it would also require a complex policy management daemon, as > devices would constantly be spun up and down as sockets were created and > destroyed. To avoid this, one device and qdisc also applies to all H2G > sockets. > > Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com> I've been thinking about this generally. vsock currently assumes reliability, but with qdisc can't we get packet drops e.g. depending on the queueing? What prevents user from configuring such a discipline? One thing people like about vsock is that it's very hard to break H2G communication even with misconfigured networking.
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index f8601d93d94d..b20ddec2664b 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -927,13 +927,30 @@ static int __init vhost_vsock_init(void) VSOCK_TRANSPORT_F_H2G); if (ret < 0) return ret; - return misc_register(&vhost_vsock_misc); + + ret = virtio_transport_init(&vhost_transport, "vhost-vsock"); + if (ret < 0) + goto out_unregister; + + ret = misc_register(&vhost_vsock_misc); + if (ret < 0) + goto out_transport_exit; + return ret; + +out_transport_exit: + virtio_transport_exit(&vhost_transport); + +out_unregister: + vsock_core_unregister(&vhost_transport.transport); + return ret; + }; static void __exit vhost_vsock_exit(void) { misc_deregister(&vhost_vsock_misc); vsock_core_unregister(&vhost_transport.transport); + virtio_transport_exit(&vhost_transport); }; module_init(vhost_vsock_init); diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 9a37eddbb87a..5d7e7fbd75f8 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -91,10 +91,20 @@ struct virtio_transport { /* This must be the first field */ struct vsock_transport transport; + /* Used almost exclusively for qdisc */ + struct net_device *dev; + /* Takes ownership of the packet */ int (*send_pkt)(struct sk_buff *skb); }; +int +virtio_transport_init(struct virtio_transport *t, + const char *name); + +void +virtio_transport_exit(struct virtio_transport *t); + ssize_t virtio_transport_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 3bb293fd8607..c6212eb38d3c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -131,7 +131,9 @@ virtio_transport_send_pkt_work(struct work_struct *work) * the vq */ if (ret < 0) { - skb_queue_head(&vsock->send_pkt_queue, skb); + spin_lock_bh(&vsock->send_pkt_queue.lock); + __skb_queue_head(&vsock->send_pkt_queue, skb); + 
spin_unlock_bh(&vsock->send_pkt_queue.lock); break; } @@ -676,7 +678,9 @@ static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) kfree_skb(skb); mutex_unlock(&vsock->tx_lock); - skb_queue_purge(&vsock->send_pkt_queue); + spin_lock_bh(&vsock->send_pkt_queue.lock); + __skb_queue_purge(&vsock->send_pkt_queue); + spin_unlock_bh(&vsock->send_pkt_queue.lock); /* Delete virtqueues and flush outstanding callbacks if any */ vdev->config->del_vqs(vdev); @@ -760,6 +764,8 @@ static void virtio_vsock_remove(struct virtio_device *vdev) flush_work(&vsock->event_work); flush_work(&vsock->send_pkt_work); + virtio_transport_exit(&virtio_transport); + mutex_unlock(&the_virtio_vsock_mutex); kfree(vsock); @@ -844,12 +850,18 @@ static int __init virtio_vsock_init(void) if (ret) goto out_wq; - ret = register_virtio_driver(&virtio_vsock_driver); + ret = virtio_transport_init(&virtio_transport, "virtio-vsock"); if (ret) goto out_vci; + ret = register_virtio_driver(&virtio_vsock_driver); + if (ret) + goto out_transport; + return 0; +out_transport: + virtio_transport_exit(&virtio_transport); out_vci: vsock_core_unregister(&virtio_transport.transport); out_wq: @@ -861,6 +873,7 @@ static void __exit virtio_vsock_exit(void) { unregister_virtio_driver(&virtio_vsock_driver); vsock_core_unregister(&virtio_transport.transport); + virtio_transport_exit(&virtio_transport); destroy_workqueue(virtio_vsock_workqueue); } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index d5780599fe93..bdf16fff054f 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -16,6 +16,7 @@ #include <net/sock.h> #include <net/af_vsock.h> +#include <net/pkt_sched.h> #define CREATE_TRACE_POINTS #include <trace/events/vsock_virtio_transport_common.h> @@ -23,6 +24,93 @@ /* How long to wait for graceful shutdown of a connection */ #define VSOCK_CLOSE_TIMEOUT (8 * HZ) +struct virtio_transport_priv { + struct virtio_transport *trans; 
+}; + +static netdev_tx_t virtio_transport_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct virtio_transport *t = + ((struct virtio_transport_priv *)netdev_priv(dev))->trans; + int ret; + + ret = t->send_pkt(skb); + if (unlikely(ret == -ENODEV)) + return NETDEV_TX_BUSY; + + return NETDEV_TX_OK; +} + +const struct net_device_ops virtio_transport_netdev_ops = { + .ndo_start_xmit = virtio_transport_start_xmit, +}; + +static void virtio_transport_setup(struct net_device *dev) +{ + dev->netdev_ops = &virtio_transport_netdev_ops; + dev->needs_free_netdev = true; + dev->flags = IFF_NOARP; + dev->mtu = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; +} + +static int ifup(struct net_device *dev) +{ + int ret; + + rtnl_lock(); + ret = dev_open(dev, NULL) ? -ENOMEM : 0; + rtnl_unlock(); + + return ret; +} + +/* virtio_transport_init - initialize a virtio vsock transport layer + * + * @t: ptr to the virtio transport struct to initialize + * @name: the name of the net_device to be created. + * + * Return 0 on success, otherwise negative errno. 
+ */ +int virtio_transport_init(struct virtio_transport *t, const char *name) +{ + struct virtio_transport_priv *priv; + int ret; + + t->dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN, virtio_transport_setup); + if (!t->dev) + return -ENOMEM; + + priv = netdev_priv(t->dev); + priv->trans = t; + + ret = register_netdev(t->dev); + if (ret < 0) + goto out_free_netdev; + + ret = ifup(t->dev); + if (ret < 0) + goto out_unregister_netdev; + + return 0; + +out_unregister_netdev: + unregister_netdev(t->dev); + +out_free_netdev: + free_netdev(t->dev); + + return ret; +} + +void virtio_transport_exit(struct virtio_transport *t) +{ + if (t->dev) { + unregister_netdev(t->dev); + free_netdev(t->dev); + } +} + static const struct virtio_transport * virtio_transport_get_ops(struct vsock_sock *vsk) { @@ -147,6 +235,24 @@ static u16 virtio_transport_get_type(struct sock *sk) return VIRTIO_VSOCK_TYPE_SEQPACKET; } +/* Return pkt->len on success, otherwise negative errno */ +static int virtio_transport_send_pkt(const struct virtio_transport *t, struct sk_buff *skb) +{ + int ret; + int len = skb->len; + + if (unlikely(!t->dev || !(t->dev->flags & IFF_UP))) + return t->send_pkt(skb); + + skb->dev = t->dev; + ret = dev_queue_xmit(skb); + + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) + return len; + + return -ENOMEM; +} + /* This function can only be used on connecting/connected sockets, * since a socket assigned to a transport is required. * @@ -202,9 +308,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, virtio_transport_inc_tx_pkt(vvs, skb); - err = t_ops->send_pkt(skb); - - return err < 0 ? 
-ENOMEM : err; + return virtio_transport_send_pkt(t_ops, skb); } static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, @@ -834,7 +938,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, return -ENOTCONN; } - return t->send_pkt(reply); + return virtio_transport_send_pkt(t, reply); } /* This function should be called with sk_lock held and SOCK_DONE set */
In order to support usage of qdisc on vsock traffic, this commit introduces a struct net_device to vhost and virtio vsock. Two new devices are created, vhost-vsock for vhost and virtio-vsock for virtio. The devices are attached to the respective transports. To bypass the usage of the device, the user may "down" the associated network interface using common tools. For example, "ip link set dev virtio-vsock down" lets vsock bypass the net_device and qdisc entirely, simply using the FIFO logic of the prior implementation. For both hosts and guests, there is one device for all G2H vsock sockets and one device for all H2G vsock sockets. This makes sense for guests because the driver only supports a single vsock channel (one pair of TX/RX virtqueues), so one device and qdisc fits. For hosts, this may not seem ideal for some workloads. However, it is possible to use a multi-queue qdisc, where a given queue is responsible for a range of sockets. This seems to be a better solution than having one device per socket, which may yield a very large number of devices and qdiscs, all of which are dynamically being created and destroyed. Because of this dynamism, it would also require a complex policy management daemon, as devices would constantly be spun up and down as sockets were created and destroyed. To avoid this, one device and qdisc also applies to all H2G sockets. Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com> --- drivers/vhost/vsock.c | 19 +++- include/linux/virtio_vsock.h | 10 +++ net/vmw_vsock/virtio_transport.c | 19 +++- net/vmw_vsock/virtio_transport_common.c | 112 +++++++++++++++++++++++- 4 files changed, 152 insertions(+), 8 deletions(-)