[v4,4/5] vhost/vsock: split packets to send using multiple buffers

Message ID 20190717113030.163499-5-sgarzare@redhat.com (mailing list archive)
State New, archived
Series vsock/virtio: optimizations to increase the throughput

Commit Message

Stefano Garzarella July 17, 2019, 11:30 a.m. UTC
If the packets to be sent to the guest are bigger than the buffer
available, we can split them, using multiple buffers and fixing
the length in the packet header.
This is safe since virtio-vsock supports only stream sockets.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
 drivers/vhost/vsock.c                   | 66 ++++++++++++++++++-------
 net/vmw_vsock/virtio_transport_common.c | 15 ++++--
 2 files changed, 60 insertions(+), 21 deletions(-)

Comments

Michael S. Tsirkin July 17, 2019, 2:54 p.m. UTC | #1
On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> If the packets to sent to the guest are bigger than the buffer
> available, we can split them, using multiple buffers and fixing
> the length in the packet header.
> This is safe since virtio-vsock supports only stream sockets.
> 
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>

So how does it work right now? If an app
does sendmsg with a 64K buffer and the other
side publishes 4K buffers - does it just stall?


> ---
>  drivers/vhost/vsock.c                   | 66 ++++++++++++++++++-------
>  net/vmw_vsock/virtio_transport_common.c | 15 ++++--
>  2 files changed, 60 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index 6c8390a2af52..9f57736fe15e 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -102,7 +102,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  		struct iov_iter iov_iter;
>  		unsigned out, in;
>  		size_t nbytes;
> -		size_t len;
> +		size_t iov_len, payload_len;
>  		int head;
>  
>  		spin_lock_bh(&vsock->send_pkt_list_lock);
> @@ -147,8 +147,24 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  			break;
>  		}
>  
> -		len = iov_length(&vq->iov[out], in);
> -		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
> +		iov_len = iov_length(&vq->iov[out], in);
> +		if (iov_len < sizeof(pkt->hdr)) {
> +			virtio_transport_free_pkt(pkt);
> +			vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
> +			break;
> +		}
> +
> +		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, iov_len);
> +		payload_len = pkt->len - pkt->off;
> +
> +		/* If the packet is greater than the space available in the
> +		 * buffer, we split it using multiple buffers.
> +		 */
> +		if (payload_len > iov_len - sizeof(pkt->hdr))
> +			payload_len = iov_len - sizeof(pkt->hdr);
> +
> +		/* Set the correct length in the header */
> +		pkt->hdr.len = cpu_to_le32(payload_len);
>  
>  		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
>  		if (nbytes != sizeof(pkt->hdr)) {
> @@ -157,33 +173,47 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
>  			break;
>  		}
>  
> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> -		if (nbytes != pkt->len) {
> +		nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len,
> +				      &iov_iter);
> +		if (nbytes != payload_len) {
>  			virtio_transport_free_pkt(pkt);
>  			vq_err(vq, "Faulted on copying pkt buf\n");
>  			break;
>  		}
>  
> -		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
> +		vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
>  		added = true;
>  
> -		if (pkt->reply) {
> -			int val;
> -
> -			val = atomic_dec_return(&vsock->queued_replies);
> -
> -			/* Do we have resources to resume tx processing? */
> -			if (val + 1 == tx_vq->num)
> -				restart_tx = true;
> -		}
> -
>  		/* Deliver to monitoring devices all correctly transmitted
>  		 * packets.
>  		 */
>  		virtio_transport_deliver_tap_pkt(pkt);
>  
> -		total_len += pkt->len;
> -		virtio_transport_free_pkt(pkt);
> +		pkt->off += payload_len;
> +		total_len += payload_len;
> +
> +		/* If we didn't send all the payload we can requeue the packet
> +		 * to send it with the next available buffer.
> +		 */
> +		if (pkt->off < pkt->len) {
> +			spin_lock_bh(&vsock->send_pkt_list_lock);
> +			list_add(&pkt->list, &vsock->send_pkt_list);
> +			spin_unlock_bh(&vsock->send_pkt_list_lock);
> +		} else {
> +			if (pkt->reply) {
> +				int val;
> +
> +				val = atomic_dec_return(&vsock->queued_replies);
> +
> +				/* Do we have resources to resume tx
> +				 * processing?
> +				 */
> +				if (val + 1 == tx_vq->num)
> +					restart_tx = true;
> +			}
> +
> +			virtio_transport_free_pkt(pkt);
> +		}
>  	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
>  	if (added)
>  		vhost_signal(&vsock->dev, vq);
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 34a2b42313b7..56fab3f03d0e 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -97,8 +97,17 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>  	struct virtio_vsock_pkt *pkt = opaque;
>  	struct af_vsockmon_hdr *hdr;
>  	struct sk_buff *skb;
> +	size_t payload_len;
> +	void *payload_buf;
>  
> -	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
> +	/* A packet could be split to fit the RX buffer, so we can retrieve
> +	 * the payload length from the header and the buffer pointer taking
> +	 * care of the offset in the original packet.
> +	 */
> +	payload_len = le32_to_cpu(pkt->hdr.len);
> +	payload_buf = pkt->buf + pkt->off;
> +
> +	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len,
>  			GFP_ATOMIC);
>  	if (!skb)
>  		return NULL;
> @@ -138,8 +147,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>  
>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
>  
> -	if (pkt->len) {
> -		skb_put_data(skb, pkt->buf, pkt->len);
> +	if (payload_len) {
> +		skb_put_data(skb, payload_buf, payload_len);
>  	}
>  
>  	return skb;
> -- 
> 2.20.1
Stefano Garzarella July 18, 2019, 7:50 a.m. UTC | #2
On Wed, Jul 17, 2019 at 4:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> > If the packets to sent to the guest are bigger than the buffer
> > available, we can split them, using multiple buffers and fixing
> > the length in the packet header.
> > This is safe since virtio-vsock supports only stream sockets.
> >
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>
> So how does it work right now? If an app
> does sendmsg with a 64K buffer and the other
> side publishes 4K buffers - does it just stall?

Before this series, 64K (or bigger) user messages were split into 4K packets
(a size fixed in the code) and queued in an internal list for the TX worker.

After this series, we will queue packets of up to 64K and then they will be
split in the TX worker, depending on the size of the buffers available in the
vring. (The idea was to allow EWMA or a configuration of the buffer size, but
for now we have postponed it.)
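
Just to make the splitting concrete, here is a stand-alone sketch of the
arithmetic done in the TX worker (header and buffer sizes are illustrative
constants here; the real values come from the vring descriptors at runtime):

#include <stdio.h>
#include <stddef.h>

#define HDR_SIZE    44          /* roughly sizeof(struct virtio_vsock_hdr) */
#define RX_BUF_SIZE 4096        /* one guest RX buffer (illustrative) */

int main(void)
{
        size_t pkt_len = 64 * 1024;     /* e.g. one 64K message queued for TX */
        size_t off = 0;
        int bufs = 0;

        while (off < pkt_len) {
                size_t payload_len = pkt_len - off;
                size_t room = RX_BUF_SIZE - HDR_SIZE;

                /* same clamp as in the patch: never exceed the space left
                 * in the buffer after the header
                 */
                if (payload_len > room)
                        payload_len = room;

                /* here hdr.len would be set to payload_len and the chunk
                 * copied into the buffer
                 */
                off += payload_len;
                bufs++;
        }

        printf("sent %zu bytes using %d buffers\n", off, bufs);
        return 0;
}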

Note: virtio-vsock only supports stream sockets for now.

Thanks,
Stefano
Michael S. Tsirkin July 18, 2019, 8:13 a.m. UTC | #3
On Thu, Jul 18, 2019 at 09:50:14AM +0200, Stefano Garzarella wrote:
> On Wed, Jul 17, 2019 at 4:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> > > If the packets to sent to the guest are bigger than the buffer
> > > available, we can split them, using multiple buffers and fixing
> > > the length in the packet header.
> > > This is safe since virtio-vsock supports only stream sockets.
> > >
> > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> >
> > So how does it work right now? If an app
> > does sendmsg with a 64K buffer and the other
> > side publishes 4K buffers - does it just stall?
> 
> Before this series, the 64K (or bigger) user messages was split in 4K packets
> (fixed in the code) and queued in an internal list for the TX worker.
> 
> After this series, we will queue up to 64K packets and then it will be split in
> the TX worker, depending on the size of the buffers available in the
> vring. (The idea was to allow EWMA or a configuration of the buffers size, but
> for now we postponed it)

Got it. Using workers for xmit is IMHO a bad idea btw.
Why is it done like this?

> Note: virtio-vsock only supports stream socket for now.
> 
> Thanks,
> Stefano
Stefano Garzarella July 18, 2019, 9:37 a.m. UTC | #4
On Thu, Jul 18, 2019 at 10:13 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> On Thu, Jul 18, 2019 at 09:50:14AM +0200, Stefano Garzarella wrote:
> > On Wed, Jul 17, 2019 at 4:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> > > > If the packets to sent to the guest are bigger than the buffer
> > > > available, we can split them, using multiple buffers and fixing
> > > > the length in the packet header.
> > > > This is safe since virtio-vsock supports only stream sockets.
> > > >
> > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > >
> > > So how does it work right now? If an app
> > > does sendmsg with a 64K buffer and the other
> > > side publishes 4K buffers - does it just stall?
> >
> > Before this series, the 64K (or bigger) user messages was split in 4K packets
> > (fixed in the code) and queued in an internal list for the TX worker.
> >
> > After this series, we will queue up to 64K packets and then it will be split in
> > the TX worker, depending on the size of the buffers available in the
> > vring. (The idea was to allow EWMA or a configuration of the buffers size, but
> > for now we postponed it)
>
> Got it. Using workers for xmit is IMHO a bad idea btw.
> Why is it done like this?

Honestly, I don't know the exact reasons for this design, but I suppose
the idea was to have only one worker that uses the vring, and multiple
user threads that enqueue packets on the list.
This can simplify the code, and we can put the user threads to sleep if
we don't have "credit" available (meaning the receiver doesn't have
space to receive the packet).
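
As a purely illustrative user-space analogy of that design (not the vhost
code, all names here are made up): several "user threads" append packets to a
locked list and a single "TX worker" drains it, much like send_pkt_list and
the worker in vhost/vsock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pkt {
        struct pkt *next;
        int len;
};

static struct pkt *pkt_list;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t list_cond = PTHREAD_COND_INITIALIZER;

static void *user_thread(void *arg)
{
        struct pkt *p = calloc(1, sizeof(*p));

        p->len = (int)(long)arg;
        pthread_mutex_lock(&list_lock);
        p->next = pkt_list;             /* enqueue for the worker */
        pkt_list = p;
        pthread_cond_signal(&list_cond);
        pthread_mutex_unlock(&list_lock);
        return NULL;
}

static void *tx_worker(void *arg)
{
        int sent = 0;

        (void)arg;
        while (sent < 4) {
                struct pkt *p;

                pthread_mutex_lock(&list_lock);
                while (!pkt_list)
                        pthread_cond_wait(&list_cond, &list_lock);
                p = pkt_list;
                pkt_list = p->next;
                pthread_mutex_unlock(&list_lock);

                printf("worker: sent %d bytes\n", p->len); /* copy to vring */
                free(p);
                sent++;
        }
        return NULL;
}

int main(void)
{
        pthread_t worker, users[4];
        long i;

        pthread_create(&worker, NULL, tx_worker, NULL);
        for (i = 0; i < 4; i++)
                pthread_create(&users[i], NULL, user_thread,
                               (void *)((i + 1) * 4096));
        for (i = 0; i < 4; i++)
                pthread_join(users[i], NULL);
        pthread_join(worker, NULL);
        return 0;
}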

What are the drawbacks in your opinion?


Thanks,
Stefano
Michael S. Tsirkin July 18, 2019, 11:35 a.m. UTC | #5
On Thu, Jul 18, 2019 at 11:37:30AM +0200, Stefano Garzarella wrote:
> On Thu, Jul 18, 2019 at 10:13 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Thu, Jul 18, 2019 at 09:50:14AM +0200, Stefano Garzarella wrote:
> > > On Wed, Jul 17, 2019 at 4:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> > > > > If the packets to sent to the guest are bigger than the buffer
> > > > > available, we can split them, using multiple buffers and fixing
> > > > > the length in the packet header.
> > > > > This is safe since virtio-vsock supports only stream sockets.
> > > > >
> > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > >
> > > > So how does it work right now? If an app
> > > > does sendmsg with a 64K buffer and the other
> > > > side publishes 4K buffers - does it just stall?
> > >
> > > Before this series, the 64K (or bigger) user messages was split in 4K packets
> > > (fixed in the code) and queued in an internal list for the TX worker.
> > >
> > > After this series, we will queue up to 64K packets and then it will be split in
> > > the TX worker, depending on the size of the buffers available in the
> > > vring. (The idea was to allow EWMA or a configuration of the buffers size, but
> > > for now we postponed it)
> >
> > Got it. Using workers for xmit is IMHO a bad idea btw.
> > Why is it done like this?
> 
> Honestly, I don't know the exact reasons for this design, but I suppose
> that the idea was to have only one worker that uses the vring, and
> multiple user threads that enqueue packets in the list.
> This can simplify the code and we can put the user threads to sleep if
> we don't have "credit" available (this means that the receiver doesn't
> have space to receive the packet).


I think you mean the reverse: even without credits you can copy from
user and queue up data, then process it without waking up the user
thread.
Does it help though? It certainly adds work outside of user thread
context, which means it's not accounted for correctly.

Maybe we want more VQs. Would help improve parallelism. The question
would then become how to map sockets to VQs. With a simple hash
it's easy to create collisions ...


> 
> What are the drawbacks in your opinion?
> 
> 
> Thanks,
> Stefano

- More pressure on scheduler
- Increased latency
Stefano Garzarella July 19, 2019, 8:08 a.m. UTC | #6
On Thu, Jul 18, 2019 at 07:35:46AM -0400, Michael S. Tsirkin wrote:
> On Thu, Jul 18, 2019 at 11:37:30AM +0200, Stefano Garzarella wrote:
> > On Thu, Jul 18, 2019 at 10:13 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > On Thu, Jul 18, 2019 at 09:50:14AM +0200, Stefano Garzarella wrote:
> > > > On Wed, Jul 17, 2019 at 4:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> > > > > > If the packets to sent to the guest are bigger than the buffer
> > > > > > available, we can split them, using multiple buffers and fixing
> > > > > > the length in the packet header.
> > > > > > This is safe since virtio-vsock supports only stream sockets.
> > > > > >
> > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > >
> > > > > So how does it work right now? If an app
> > > > > does sendmsg with a 64K buffer and the other
> > > > > side publishes 4K buffers - does it just stall?
> > > >
> > > > Before this series, the 64K (or bigger) user messages was split in 4K packets
> > > > (fixed in the code) and queued in an internal list for the TX worker.
> > > >
> > > > After this series, we will queue up to 64K packets and then it will be split in
> > > > the TX worker, depending on the size of the buffers available in the
> > > > vring. (The idea was to allow EWMA or a configuration of the buffers size, but
> > > > for now we postponed it)
> > >
> > > Got it. Using workers for xmit is IMHO a bad idea btw.
> > > Why is it done like this?
> > 
> > Honestly, I don't know the exact reasons for this design, but I suppose
> > that the idea was to have only one worker that uses the vring, and
> > multiple user threads that enqueue packets in the list.
> > This can simplify the code and we can put the user threads to sleep if
> > we don't have "credit" available (this means that the receiver doesn't
> > have space to receive the packet).
> 
> 
> I think you mean the reverse: even without credits you can copy from
> user and queue up data, then process it without waking up the user
> thread.

I checked the code more carefully, but it doesn't seem to do that.
The .sendmsg callback of af_vsock checks whether the transport has space
(the virtio-vsock transport returns the credit available). If there is no
space, it puts the thread to sleep on the 'sk_sleep(sk)' wait_queue.

When the transport receives a credit update from the other peer, it calls
'sk->sk_write_space(sk)', which wakes up the sleeping thread so it can
queue the new packet.

So, in the current implementation, the TX worker doesn't check the
available credit; it only sends the packets.
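
For reference, the credit accounting involved is roughly the following
(stand-alone sketch based on the spec fields; the helper and the numbers are
made up for the example, they are not the kernel code):

#include <stdio.h>
#include <stdint.h>

struct credit {
        uint32_t peer_buf_alloc;        /* RX space the peer advertised */
        uint32_t tx_cnt;                /* bytes we have sent so far */
        uint32_t fwd_cnt;               /* bytes the peer reported consumed */
};

/* space the peer can still accept; sendmsg() sleeps when this reaches 0 */
static uint32_t credit_available(const struct credit *c)
{
        return c->peer_buf_alloc - (c->tx_cnt - c->fwd_cnt);
}

int main(void)
{
        struct credit c = { .peer_buf_alloc = 256 * 1024 };

        c.tx_cnt += 64 * 1024;          /* we queued a 64K message */
        printf("credit after send:   %u\n", credit_available(&c));

        c.fwd_cnt += 64 * 1024;         /* credit update arrives; this is when
                                         * sk_write_space() wakes the sender
                                         */
        printf("credit after update: %u\n", credit_available(&c));
        return 0;
}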

> Does it help though? It certainly adds up work outside of
> user thread context which means it's not accounted for
> correctly.

I can try to xmit the packets directly in the user thread context to see
how much it improves things.

> 
> Maybe we want more VQs. Would help improve parallelism. The question
> would then become how to map sockets to VQs. With a simple hash
> it's easy to create collisions ...

Yes, more VQs can help, but the mapping question is not simple to answer.
Maybe we can do a hash on the (cid, port) pair, or do some kind of
estimation of queue utilization and try to balance.
Should the mapping be unique?
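
To make the collision point concrete, a toy (cid, port) -> TX virtqueue
mapping could look like the sketch below (purely illustrative, nothing like
this exists in the code today):

#include <stdio.h>
#include <stdint.h>

#define NUM_TX_VQS 4

static unsigned int pick_vq(uint64_t cid, uint32_t port)
{
        /* simple multiplicative hash; with few VQs, collisions (the
         * concern raised above) are easy to hit
         */
        uint64_t h = (cid * 0x9e3779b97f4a7c15ULL) ^ port;

        return (unsigned int)(h % NUM_TX_VQS);
}

int main(void)
{
        uint32_t port;

        for (port = 1030; port < 1036; port++)
                printf("cid 3, port %u -> vq %u\n",
                       (unsigned int)port, pick_vq(3, port));
        return 0;
}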

> 
> 
> > 
> > What are the drawbacks in your opinion?
> > 
> > 
> > Thanks,
> > Stefano
> 
> - More pressure on scheduler
> - Increased latency
> 

Thanks,
Stefano
Jason Wang July 19, 2019, 8:21 a.m. UTC | #7
On 2019/7/19 4:08 PM, Stefano Garzarella wrote:
> On Thu, Jul 18, 2019 at 07:35:46AM -0400, Michael S. Tsirkin wrote:
>> On Thu, Jul 18, 2019 at 11:37:30AM +0200, Stefano Garzarella wrote:
>>> On Thu, Jul 18, 2019 at 10:13 AM Michael S. Tsirkin<mst@redhat.com>  wrote:
>>>> On Thu, Jul 18, 2019 at 09:50:14AM +0200, Stefano Garzarella wrote:
>>>>> On Wed, Jul 17, 2019 at 4:55 PM Michael S. Tsirkin<mst@redhat.com>  wrote:
>>>>>> On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
>>>>>>> If the packets to sent to the guest are bigger than the buffer
>>>>>>> available, we can split them, using multiple buffers and fixing
>>>>>>> the length in the packet header.
>>>>>>> This is safe since virtio-vsock supports only stream sockets.
>>>>>>>
>>>>>>> Signed-off-by: Stefano Garzarella<sgarzare@redhat.com>
>>>>>> So how does it work right now? If an app
>>>>>> does sendmsg with a 64K buffer and the other
>>>>>> side publishes 4K buffers - does it just stall?
>>>>> Before this series, the 64K (or bigger) user messages was split in 4K packets
>>>>> (fixed in the code) and queued in an internal list for the TX worker.
>>>>>
>>>>> After this series, we will queue up to 64K packets and then it will be split in
>>>>> the TX worker, depending on the size of the buffers available in the
>>>>> vring. (The idea was to allow EWMA or a configuration of the buffers size, but
>>>>> for now we postponed it)
>>>> Got it. Using workers for xmit is IMHO a bad idea btw.
>>>> Why is it done like this?
>>> Honestly, I don't know the exact reasons for this design, but I suppose
>>> that the idea was to have only one worker that uses the vring, and
>>> multiple user threads that enqueue packets in the list.
>>> This can simplify the code and we can put the user threads to sleep if
>>> we don't have "credit" available (this means that the receiver doesn't
>>> have space to receive the packet).
>> I think you mean the reverse: even without credits you can copy from
>> user and queue up data, then process it without waking up the user
>> thread.
> I checked the code better, but it doesn't seem to do that.
> The .sendmsg callback of af_vsock, check if the transport has space
> (virtio-vsock transport returns the credit available). If there is no
> space, it put the thread to sleep on the 'sk_sleep(sk)' wait_queue.
>
> When the transport receives an update of credit available on the other
> peer, it calls 'sk->sk_write_space(sk)' that wakes up the thread
> sleeping, that will queue the new packet.
>
> So, in the current implementation, the TX worker doesn't check the
> credit available, it only sends the packets.
>
>> Does it help though? It certainly adds up work outside of
>> user thread context which means it's not accounted for
>> correctly.
> I can try to xmit the packet directly in the user thread context, to see
> the improvements.


It will then look more like what virtio-net (and other networking
devices) do.


>
>> Maybe we want more VQs. Would help improve parallelism. The question
>> would then become how to map sockets to VQs. With a simple hash
>> it's easy to create collisions ...
> Yes, more VQs can help but the map question is not simple to answer.
> Maybe we can do an hash on the (cid, port) or do some kind of estimation
> of queue utilization and try to balance.
> Should the mapping be unique?


It sounds to me like you want some kind of fair queuing? We already have
several qdiscs that do this.

So if we use the kernel networking xmit path, all those issues could be 
addressed.

Thanks

>
Stefano Garzarella July 19, 2019, 8:39 a.m. UTC | #8
On Fri, Jul 19, 2019 at 04:21:52PM +0800, Jason Wang wrote:
> 
> On 2019/7/19 4:08 PM, Stefano Garzarella wrote:
> > On Thu, Jul 18, 2019 at 07:35:46AM -0400, Michael S. Tsirkin wrote:
> > > On Thu, Jul 18, 2019 at 11:37:30AM +0200, Stefano Garzarella wrote:
> > > > On Thu, Jul 18, 2019 at 10:13 AM Michael S. Tsirkin<mst@redhat.com>  wrote:
> > > > > On Thu, Jul 18, 2019 at 09:50:14AM +0200, Stefano Garzarella wrote:
> > > > > > On Wed, Jul 17, 2019 at 4:55 PM Michael S. Tsirkin<mst@redhat.com>  wrote:
> > > > > > > On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> > > > > > > > If the packets to sent to the guest are bigger than the buffer
> > > > > > > > available, we can split them, using multiple buffers and fixing
> > > > > > > > the length in the packet header.
> > > > > > > > This is safe since virtio-vsock supports only stream sockets.
> > > > > > > > 
> > > > > > > > Signed-off-by: Stefano Garzarella<sgarzare@redhat.com>
> > > > > > > So how does it work right now? If an app
> > > > > > > does sendmsg with a 64K buffer and the other
> > > > > > > side publishes 4K buffers - does it just stall?
> > > > > > Before this series, the 64K (or bigger) user messages was split in 4K packets
> > > > > > (fixed in the code) and queued in an internal list for the TX worker.
> > > > > > 
> > > > > > After this series, we will queue up to 64K packets and then it will be split in
> > > > > > the TX worker, depending on the size of the buffers available in the
> > > > > > vring. (The idea was to allow EWMA or a configuration of the buffers size, but
> > > > > > for now we postponed it)
> > > > > Got it. Using workers for xmit is IMHO a bad idea btw.
> > > > > Why is it done like this?
> > > > Honestly, I don't know the exact reasons for this design, but I suppose
> > > > that the idea was to have only one worker that uses the vring, and
> > > > multiple user threads that enqueue packets in the list.
> > > > This can simplify the code and we can put the user threads to sleep if
> > > > we don't have "credit" available (this means that the receiver doesn't
> > > > have space to receive the packet).
> > > I think you mean the reverse: even without credits you can copy from
> > > user and queue up data, then process it without waking up the user
> > > thread.
> > I checked the code better, but it doesn't seem to do that.
> > The .sendmsg callback of af_vsock, check if the transport has space
> > (virtio-vsock transport returns the credit available). If there is no
> > space, it put the thread to sleep on the 'sk_sleep(sk)' wait_queue.
> > 
> > When the transport receives an update of credit available on the other
> > peer, it calls 'sk->sk_write_space(sk)' that wakes up the thread
> > sleeping, that will queue the new packet.
> > 
> > So, in the current implementation, the TX worker doesn't check the
> > credit available, it only sends the packets.
> > 
> > > Does it help though? It certainly adds up work outside of
> > > user thread context which means it's not accounted for
> > > correctly.
> > I can try to xmit the packet directly in the user thread context, to see
> > the improvements.
> 
> 
> It will then looks more like what virtio-net (and other networking device)
> did.

I'll try ASAP, the changes should not be too complicated... I hope :)

> 
> 
> > 
> > > Maybe we want more VQs. Would help improve parallelism. The question
> > > would then become how to map sockets to VQs. With a simple hash
> > > it's easy to create collisions ...
> > Yes, more VQs can help but the map question is not simple to answer.
> > Maybe we can do an hash on the (cid, port) or do some kind of estimation
> > of queue utilization and try to balance.
> > Should the mapping be unique?
> 
> 
> It sounds to me you want some kind of fair queuing? We've already had
> several qdiscs that do this.

Thanks for pointing it out!

> 
> So if we use the kernel networking xmit path, all those issues could be
> addressed.

One more point in favor of AF_VSOCK + net-stack, but we have to evaluate the
possible drawbacks of using the net-stack (e.g. more latency due to the
complexity of the net-stack?).

Thanks,
Stefano
Jason Wang July 19, 2019, 8:51 a.m. UTC | #9
On 2019/7/19 4:39 PM, Stefano Garzarella wrote:
> On Fri, Jul 19, 2019 at 04:21:52PM +0800, Jason Wang wrote:
>> On 2019/7/19 4:08 PM, Stefano Garzarella wrote:
>>> On Thu, Jul 18, 2019 at 07:35:46AM -0400, Michael S. Tsirkin wrote:
>>>> On Thu, Jul 18, 2019 at 11:37:30AM +0200, Stefano Garzarella wrote:
>>>>> On Thu, Jul 18, 2019 at 10:13 AM Michael S. Tsirkin<mst@redhat.com>  wrote:
>>>>>> On Thu, Jul 18, 2019 at 09:50:14AM +0200, Stefano Garzarella wrote:
>>>>>>> On Wed, Jul 17, 2019 at 4:55 PM Michael S. Tsirkin<mst@redhat.com>  wrote:
>>>>>>>> On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
>>>>>>>>> If the packets to sent to the guest are bigger than the buffer
>>>>>>>>> available, we can split them, using multiple buffers and fixing
>>>>>>>>> the length in the packet header.
>>>>>>>>> This is safe since virtio-vsock supports only stream sockets.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Stefano Garzarella<sgarzare@redhat.com>
>>>>>>>> So how does it work right now? If an app
>>>>>>>> does sendmsg with a 64K buffer and the other
>>>>>>>> side publishes 4K buffers - does it just stall?
>>>>>>> Before this series, the 64K (or bigger) user messages was split in 4K packets
>>>>>>> (fixed in the code) and queued in an internal list for the TX worker.
>>>>>>>
>>>>>>> After this series, we will queue up to 64K packets and then it will be split in
>>>>>>> the TX worker, depending on the size of the buffers available in the
>>>>>>> vring. (The idea was to allow EWMA or a configuration of the buffers size, but
>>>>>>> for now we postponed it)
>>>>>> Got it. Using workers for xmit is IMHO a bad idea btw.
>>>>>> Why is it done like this?
>>>>> Honestly, I don't know the exact reasons for this design, but I suppose
>>>>> that the idea was to have only one worker that uses the vring, and
>>>>> multiple user threads that enqueue packets in the list.
>>>>> This can simplify the code and we can put the user threads to sleep if
>>>>> we don't have "credit" available (this means that the receiver doesn't
>>>>> have space to receive the packet).
>>>> I think you mean the reverse: even without credits you can copy from
>>>> user and queue up data, then process it without waking up the user
>>>> thread.
>>> I checked the code better, but it doesn't seem to do that.
>>> The .sendmsg callback of af_vsock, check if the transport has space
>>> (virtio-vsock transport returns the credit available). If there is no
>>> space, it put the thread to sleep on the 'sk_sleep(sk)' wait_queue.
>>>
>>> When the transport receives an update of credit available on the other
>>> peer, it calls 'sk->sk_write_space(sk)' that wakes up the thread
>>> sleeping, that will queue the new packet.
>>>
>>> So, in the current implementation, the TX worker doesn't check the
>>> credit available, it only sends the packets.
>>>
>>>> Does it help though? It certainly adds up work outside of
>>>> user thread context which means it's not accounted for
>>>> correctly.
>>> I can try to xmit the packet directly in the user thread context, to see
>>> the improvements.
>>
>> It will then looks more like what virtio-net (and other networking device)
>> did.
> I'll try ASAP, the changes should not be too complicated... I hope :)
>
>>
>>>> Maybe we want more VQs. Would help improve parallelism. The question
>>>> would then become how to map sockets to VQs. With a simple hash
>>>> it's easy to create collisions ...
>>> Yes, more VQs can help but the map question is not simple to answer.
>>> Maybe we can do an hash on the (cid, port) or do some kind of estimation
>>> of queue utilization and try to balance.
>>> Should the mapping be unique?
>>
>> It sounds to me you want some kind of fair queuing? We've already had
>> several qdiscs that do this.
> Thanks for pointing it out!
>
>> So if we use the kernel networking xmit path, all those issues could be
>> addressed.
> One more point to AF_VSOCK + net-stack, but we have to evaluate possible
> drawbacks in using the net-stack. (e.g. more latency due to the complexity
> of the net-stack?)


Yes, we need to benchmark the performance. But as we've noticed, the current
vsock implementation is not efficient, and for stream sockets the overhead
should be minimal. The most important thing is to avoid reinventing things
that already exist.

Thanks


>
> Thanks,
> Stefano
Stefano Garzarella July 19, 2019, 9:20 a.m. UTC | #10
On Fri, Jul 19, 2019 at 04:51:00PM +0800, Jason Wang wrote:
> 
> On 2019/7/19 4:39 PM, Stefano Garzarella wrote:
> > On Fri, Jul 19, 2019 at 04:21:52PM +0800, Jason Wang wrote:
> > > On 2019/7/19 4:08 PM, Stefano Garzarella wrote:
> > > > On Thu, Jul 18, 2019 at 07:35:46AM -0400, Michael S. Tsirkin wrote:
> > > > > On Thu, Jul 18, 2019 at 11:37:30AM +0200, Stefano Garzarella wrote:
> > > > > > On Thu, Jul 18, 2019 at 10:13 AM Michael S. Tsirkin<mst@redhat.com>  wrote:
> > > > > > > On Thu, Jul 18, 2019 at 09:50:14AM +0200, Stefano Garzarella wrote:
> > > > > > > > On Wed, Jul 17, 2019 at 4:55 PM Michael S. Tsirkin<mst@redhat.com>  wrote:
> > > > > > > > > On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> > > > > > > > > > If the packets to sent to the guest are bigger than the buffer
> > > > > > > > > > available, we can split them, using multiple buffers and fixing
> > > > > > > > > > the length in the packet header.
> > > > > > > > > > This is safe since virtio-vsock supports only stream sockets.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Stefano Garzarella<sgarzare@redhat.com>
> > > > > > > > > So how does it work right now? If an app
> > > > > > > > > does sendmsg with a 64K buffer and the other
> > > > > > > > > side publishes 4K buffers - does it just stall?
> > > > > > > > Before this series, the 64K (or bigger) user messages was split in 4K packets
> > > > > > > > (fixed in the code) and queued in an internal list for the TX worker.
> > > > > > > > 
> > > > > > > > After this series, we will queue up to 64K packets and then it will be split in
> > > > > > > > the TX worker, depending on the size of the buffers available in the
> > > > > > > > vring. (The idea was to allow EWMA or a configuration of the buffers size, but
> > > > > > > > for now we postponed it)
> > > > > > > Got it. Using workers for xmit is IMHO a bad idea btw.
> > > > > > > Why is it done like this?
> > > > > > Honestly, I don't know the exact reasons for this design, but I suppose
> > > > > > that the idea was to have only one worker that uses the vring, and
> > > > > > multiple user threads that enqueue packets in the list.
> > > > > > This can simplify the code and we can put the user threads to sleep if
> > > > > > we don't have "credit" available (this means that the receiver doesn't
> > > > > > have space to receive the packet).
> > > > > I think you mean the reverse: even without credits you can copy from
> > > > > user and queue up data, then process it without waking up the user
> > > > > thread.
> > > > I checked the code better, but it doesn't seem to do that.
> > > > The .sendmsg callback of af_vsock, check if the transport has space
> > > > (virtio-vsock transport returns the credit available). If there is no
> > > > space, it put the thread to sleep on the 'sk_sleep(sk)' wait_queue.
> > > > 
> > > > When the transport receives an update of credit available on the other
> > > > peer, it calls 'sk->sk_write_space(sk)' that wakes up the thread
> > > > sleeping, that will queue the new packet.
> > > > 
> > > > So, in the current implementation, the TX worker doesn't check the
> > > > credit available, it only sends the packets.
> > > > 
> > > > > Does it help though? It certainly adds up work outside of
> > > > > user thread context which means it's not accounted for
> > > > > correctly.
> > > > I can try to xmit the packet directly in the user thread context, to see
> > > > the improvements.
> > > 
> > > It will then looks more like what virtio-net (and other networking device)
> > > did.
> > I'll try ASAP, the changes should not be too complicated... I hope :)
> > 
> > > 
> > > > > Maybe we want more VQs. Would help improve parallelism. The question
> > > > > would then become how to map sockets to VQs. With a simple hash
> > > > > it's easy to create collisions ...
> > > > Yes, more VQs can help but the map question is not simple to answer.
> > > > Maybe we can do an hash on the (cid, port) or do some kind of estimation
> > > > of queue utilization and try to balance.
> > > > Should the mapping be unique?
> > > 
> > > It sounds to me you want some kind of fair queuing? We've already had
> > > several qdiscs that do this.
> > Thanks for pointing it out!
> > 
> > > So if we use the kernel networking xmit path, all those issues could be
> > > addressed.
> > One more point to AF_VSOCK + net-stack, but we have to evaluate possible
> > drawbacks in using the net-stack. (e.g. more latency due to the complexity
> > of the net-stack?)
> 
> 
> Yes, we need benchmark the performance. But as we've noticed, current vsock
> implementation is not efficient, and for stream socket, the overhead should
> be minimal. The most important thing is to avoid reinventing things that has
> already existed.

Got it. I completely agree with you, and I want to avoid reinventing things
(surely in a worse way).

But the idea (also suggested by Michael) of seeing how fast a new protocol
separate from the networking stack can go is quite attractive :)

Thanks,
Stefano
Stefan Hajnoczi July 22, 2019, 9:06 a.m. UTC | #11
On Wed, Jul 17, 2019 at 01:30:29PM +0200, Stefano Garzarella wrote:
> If the packets to sent to the guest are bigger than the buffer
> available, we can split them, using multiple buffers and fixing
> the length in the packet header.
> This is safe since virtio-vsock supports only stream sockets.
> 
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> ---
>  drivers/vhost/vsock.c                   | 66 ++++++++++++++++++-------
>  net/vmw_vsock/virtio_transport_common.c | 15 ++++--
>  2 files changed, 60 insertions(+), 21 deletions(-)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>

Patch

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 6c8390a2af52..9f57736fe15e 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -102,7 +102,7 @@  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		struct iov_iter iov_iter;
 		unsigned out, in;
 		size_t nbytes;
-		size_t len;
+		size_t iov_len, payload_len;
 		int head;
 
 		spin_lock_bh(&vsock->send_pkt_list_lock);
@@ -147,8 +147,24 @@  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		len = iov_length(&vq->iov[out], in);
-		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
+		iov_len = iov_length(&vq->iov[out], in);
+		if (iov_len < sizeof(pkt->hdr)) {
+			virtio_transport_free_pkt(pkt);
+			vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
+			break;
+		}
+
+		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, iov_len);
+		payload_len = pkt->len - pkt->off;
+
+		/* If the packet is greater than the space available in the
+		 * buffer, we split it using multiple buffers.
+		 */
+		if (payload_len > iov_len - sizeof(pkt->hdr))
+			payload_len = iov_len - sizeof(pkt->hdr);
+
+		/* Set the correct length in the header */
+		pkt->hdr.len = cpu_to_le32(payload_len);
 
 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
 		if (nbytes != sizeof(pkt->hdr)) {
@@ -157,33 +173,47 @@  vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
-		if (nbytes != pkt->len) {
+		nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len,
+				      &iov_iter);
+		if (nbytes != payload_len) {
 			virtio_transport_free_pkt(pkt);
 			vq_err(vq, "Faulted on copying pkt buf\n");
 			break;
 		}
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+		vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
 		added = true;
 
-		if (pkt->reply) {
-			int val;
-
-			val = atomic_dec_return(&vsock->queued_replies);
-
-			/* Do we have resources to resume tx processing? */
-			if (val + 1 == tx_vq->num)
-				restart_tx = true;
-		}
-
 		/* Deliver to monitoring devices all correctly transmitted
 		 * packets.
 		 */
 		virtio_transport_deliver_tap_pkt(pkt);
 
-		total_len += pkt->len;
-		virtio_transport_free_pkt(pkt);
+		pkt->off += payload_len;
+		total_len += payload_len;
+
+		/* If we didn't send all the payload we can requeue the packet
+		 * to send it with the next available buffer.
+		 */
+		if (pkt->off < pkt->len) {
+			spin_lock_bh(&vsock->send_pkt_list_lock);
+			list_add(&pkt->list, &vsock->send_pkt_list);
+			spin_unlock_bh(&vsock->send_pkt_list_lock);
+		} else {
+			if (pkt->reply) {
+				int val;
+
+				val = atomic_dec_return(&vsock->queued_replies);
+
+				/* Do we have resources to resume tx
+				 * processing?
+				 */
+				if (val + 1 == tx_vq->num)
+					restart_tx = true;
+			}
+
+			virtio_transport_free_pkt(pkt);
+		}
 	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
 	if (added)
 		vhost_signal(&vsock->dev, vq);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 34a2b42313b7..56fab3f03d0e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -97,8 +97,17 @@  static struct sk_buff *virtio_transport_build_skb(void *opaque)
 	struct virtio_vsock_pkt *pkt = opaque;
 	struct af_vsockmon_hdr *hdr;
 	struct sk_buff *skb;
+	size_t payload_len;
+	void *payload_buf;
 
-	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
+	/* A packet could be split to fit the RX buffer, so we can retrieve
+	 * the payload length from the header and the buffer pointer taking
+	 * care of the offset in the original packet.
+	 */
+	payload_len = le32_to_cpu(pkt->hdr.len);
+	payload_buf = pkt->buf + pkt->off;
+
+	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len,
 			GFP_ATOMIC);
 	if (!skb)
 		return NULL;
@@ -138,8 +147,8 @@  static struct sk_buff *virtio_transport_build_skb(void *opaque)
 
 	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
 
-	if (pkt->len) {
-		skb_put_data(skb, pkt->buf, pkt->len);
+	if (payload_len) {
+		skb_put_data(skb, payload_buf, payload_len);
 	}
 
 	return skb;