diff mbox series

[v2,1/8] vsock/virtio: limit the memory used per-socket

Message ID 20190510125843.95587-2-sgarzare@redhat.com (mailing list archive)
State New, archived
Headers show
Series vsock/virtio: optimizations to increase the throughput | expand

Commit Message

Stefano Garzarella May 10, 2019, 12:58 p.m. UTC
Since virtio-vsock was introduced, the buffers filled by the host
and pushed to the guest using the vring are directly queued in
a per-socket list, avoiding a copy.
These buffers are preallocated by the guest with a fixed
size (4 KB).

The maximum amount of memory used by each socket should be
controlled by the credit mechanism.
The default credit available per-socket is 256 KB, but if we use
only 1 byte per packet, the guest can queue up to 262144 of 4 KB
buffers, using up to 1 GB of memory per-socket. In addition, the
guest will continue to fill the vring with new 4 KB free buffers
to avoid starvation of other sockets.

This patch solves this issue by copying the payload into a new buffer.
The new buffer is then queued in the per-socket list, and the 4 KB
buffer used by the host is freed.

In this way, the memory used by each socket respects the credit
available, and we still avoid starvation, paying the cost of an
extra memory copy. When the buffer is completely full we do a
"zero-copy", moving the buffer directly in the per-socket list.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
 drivers/vhost/vsock.c                   |  2 +
 include/linux/virtio_vsock.h            |  8 +++
 net/vmw_vsock/virtio_transport.c        |  1 +
 net/vmw_vsock/virtio_transport_common.c | 95 ++++++++++++++++++-------
 4 files changed, 81 insertions(+), 25 deletions(-)

Comments

Michael S. Tsirkin May 12, 2019, 4:57 p.m. UTC | #1
On Fri, May 10, 2019 at 02:58:36PM +0200, Stefano Garzarella wrote:
> Since virtio-vsock was introduced, the buffers filled by the host
> and pushed to the guest using the vring, are directly queued in
> a per-socket list avoiding to copy it.
> These buffers are preallocated by the guest with a fixed
> size (4 KB).
> 
> The maximum amount of memory used by each socket should be
> controlled by the credit mechanism.
> The default credit available per-socket is 256 KB, but if we use
> only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> buffers, using up to 1 GB of memory per-socket. In addition, the
> guest will continue to fill the vring with new 4 KB free buffers
> to avoid starvation of other sockets.
> 
> This patch solves this issue copying the payload in a new buffer.
> Then it is queued in the per-socket list, and the 4KB buffer used
> by the host is freed.
> 
> In this way, the memory used by each socket respects the credit
> available, and we still avoid starvation, paying the cost of an
> extra memory copy. When the buffer is completely full we do a
> "zero-copy", moving the buffer directly in the per-socket list.
> 
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> ---
>  drivers/vhost/vsock.c                   |  2 +
>  include/linux/virtio_vsock.h            |  8 +++
>  net/vmw_vsock/virtio_transport.c        |  1 +
>  net/vmw_vsock/virtio_transport_common.c | 95 ++++++++++++++++++-------
>  4 files changed, 81 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index bb5fc0e9fbc2..7964e2daee09 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -320,6 +320,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
>  		return NULL;
>  	}
>  
> +	pkt->buf_len = pkt->len;
> +
>  	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>  	if (nbytes != pkt->len) {
>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> index e223e2632edd..345f04ee9193 100644
> --- a/include/linux/virtio_vsock.h
> +++ b/include/linux/virtio_vsock.h
> @@ -54,9 +54,17 @@ struct virtio_vsock_pkt {
>  	void *buf;
>  	u32 len;
>  	u32 off;
> +	u32 buf_len;
>  	bool reply;
>  };
>  
> +struct virtio_vsock_buf {
> +	struct list_head list;
> +	void *addr;
> +	u32 len;
> +	u32 off;
> +};
> +
>  struct virtio_vsock_pkt_info {
>  	u32 remote_cid, remote_port;
>  	struct vsock_sock *vsk;
> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> index 15eb5d3d4750..af1d2ce12f54 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -280,6 +280,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
>  			break;
>  		}
>  
> +		pkt->buf_len = buf_len;
>  		pkt->len = buf_len;
>  
>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 602715fc9a75..0248d6808755 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -65,6 +65,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>  		pkt->buf = kmalloc(len, GFP_KERNEL);
>  		if (!pkt->buf)
>  			goto out_pkt;
> +
> +		pkt->buf_len = len;
> +
>  		err = memcpy_from_msg(pkt->buf, info->msg, len);
>  		if (err)
>  			goto out;
> @@ -86,6 +89,46 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>  	return NULL;
>  }
>  
> +static struct virtio_vsock_buf *
> +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> +{
> +	struct virtio_vsock_buf *buf;
> +
> +	if (pkt->len == 0)
> +		return NULL;
> +
> +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> +	if (!buf)
> +		return NULL;
> +
> +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
> +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
> +	 * we are not use

we do not use

> more memory than that counted by the credit mechanism.
> +	 */
> +	if (zero_copy && pkt->len == pkt->buf_len) {
> +		buf->addr = pkt->buf;
> +		pkt->buf = NULL;
> +	} else {
> +		buf->addr = kmalloc(pkt->len, GFP_KERNEL);
> +		if (!buf->addr) {
> +			kfree(buf);
> +			return NULL;
> +		}
> +
> +		memcpy(buf->addr, pkt->buf, pkt->len);
> +	}
> +
> +	buf->len = pkt->len;
> +
> +	return buf;
> +}
> +
> +static void virtio_transport_free_buf(struct virtio_vsock_buf *buf)
> +{
> +	kfree(buf->addr);
> +	kfree(buf);
> +}
> +
>  /* Packet capture */
>  static struct sk_buff *virtio_transport_build_skb(void *opaque)
>  {
> @@ -190,17 +233,15 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>  	return virtio_transport_get_ops()->send_pkt(pkt);
>  }
>  
> -static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
> -					struct virtio_vsock_pkt *pkt)
> +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len)
>  {
> -	vvs->rx_bytes += pkt->len;
> +	vvs->rx_bytes += len;
>  }
>  
> -static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
> -					struct virtio_vsock_pkt *pkt)
> +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, u32 len)
>  {
> -	vvs->rx_bytes -= pkt->len;
> -	vvs->fwd_cnt += pkt->len;
> +	vvs->rx_bytes -= len;
> +	vvs->fwd_cnt += len;
>  }
>  
>  void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
> @@ -254,36 +295,36 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
>  				   size_t len)
>  {
>  	struct virtio_vsock_sock *vvs = vsk->trans;
> -	struct virtio_vsock_pkt *pkt;
> +	struct virtio_vsock_buf *buf;
>  	size_t bytes, total = 0;
>  	int err = -EFAULT;
>  
>  	spin_lock_bh(&vvs->rx_lock);
>  	while (total < len && !list_empty(&vvs->rx_queue)) {
> -		pkt = list_first_entry(&vvs->rx_queue,
> -				       struct virtio_vsock_pkt, list);
> +		buf = list_first_entry(&vvs->rx_queue,
> +				       struct virtio_vsock_buf, list);
>  
>  		bytes = len - total;
> -		if (bytes > pkt->len - pkt->off)
> -			bytes = pkt->len - pkt->off;
> +		if (bytes > buf->len - buf->off)
> +			bytes = buf->len - buf->off;
>  
>  		/* sk_lock is held by caller so no one else can dequeue.
>  		 * Unlock rx_lock since memcpy_to_msg() may sleep.
>  		 */
>  		spin_unlock_bh(&vvs->rx_lock);
>  
> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> +		err = memcpy_to_msg(msg, buf->addr + buf->off, bytes);
>  		if (err)
>  			goto out;
>  
>  		spin_lock_bh(&vvs->rx_lock);
>  
>  		total += bytes;
> -		pkt->off += bytes;
> -		if (pkt->off == pkt->len) {
> -			virtio_transport_dec_rx_pkt(vvs, pkt);
> -			list_del(&pkt->list);
> -			virtio_transport_free_pkt(pkt);
> +		buf->off += bytes;
> +		if (buf->off == buf->len) {
> +			virtio_transport_dec_rx_pkt(vvs, buf->len);
> +			list_del(&buf->list);
> +			virtio_transport_free_buf(buf);
>  		}
>  	}
>  	spin_unlock_bh(&vvs->rx_lock);
> @@ -841,20 +882,24 @@ virtio_transport_recv_connected(struct sock *sk,
>  {
>  	struct vsock_sock *vsk = vsock_sk(sk);
>  	struct virtio_vsock_sock *vvs = vsk->trans;
> +	struct virtio_vsock_buf *buf;
>  	int err = 0;
>  
>  	switch (le16_to_cpu(pkt->hdr.op)) {
>  	case VIRTIO_VSOCK_OP_RW:
>  		pkt->len = le32_to_cpu(pkt->hdr.len);
> -		pkt->off = 0;
> +		buf = virtio_transport_alloc_buf(pkt, true);


This seems to be the only caller, and the second parameter
is always true. So why is it needed?

>  
> -		spin_lock_bh(&vvs->rx_lock);
> -		virtio_transport_inc_rx_pkt(vvs, pkt);
> -		list_add_tail(&pkt->list, &vvs->rx_queue);
> -		spin_unlock_bh(&vvs->rx_lock);
> +		if (buf) {
> +			spin_lock_bh(&vvs->rx_lock);
> +			virtio_transport_inc_rx_pkt(vvs, pkt->len);
> +			list_add_tail(&buf->list, &vvs->rx_queue);
> +			spin_unlock_bh(&vvs->rx_lock);
>  
> -		sk->sk_data_ready(sk);
> -		return err;
> +			sk->sk_data_ready(sk);
> +		}
> +
> +		break;
>  	case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
>  		sk->sk_write_space(sk);
>  		break;
> -- 
> 2.20.1
Jason Wang May 13, 2019, 9:58 a.m. UTC | #2
On 2019/5/10 下午8:58, Stefano Garzarella wrote:
> Since virtio-vsock was introduced, the buffers filled by the host
> and pushed to the guest using the vring, are directly queued in
> a per-socket list avoiding to copy it.
> These buffers are preallocated by the guest with a fixed
> size (4 KB).
>
> The maximum amount of memory used by each socket should be
> controlled by the credit mechanism.
> The default credit available per-socket is 256 KB, but if we use
> only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> buffers, using up to 1 GB of memory per-socket. In addition, the
> guest will continue to fill the vring with new 4 KB free buffers
> to avoid starvation of other sockets.
>
> This patch solves this issue copying the payload in a new buffer.
> Then it is queued in the per-socket list, and the 4KB buffer used
> by the host is freed.
>
> In this way, the memory used by each socket respects the credit
> available, and we still avoid starvation, paying the cost of an
> extra memory copy. When the buffer is completely full we do a
> "zero-copy", moving the buffer directly in the per-socket list.


I wonder whether, in the long run, we should use the generic socket
accounting mechanism provided by the kernel (e.g. socket, skb, sndbuf,
rcvbuf, truesize) instead of a vsock-specific one, to avoid duplicating effort.


>
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> ---
>   drivers/vhost/vsock.c                   |  2 +
>   include/linux/virtio_vsock.h            |  8 +++
>   net/vmw_vsock/virtio_transport.c        |  1 +
>   net/vmw_vsock/virtio_transport_common.c | 95 ++++++++++++++++++-------
>   4 files changed, 81 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index bb5fc0e9fbc2..7964e2daee09 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -320,6 +320,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
>   		return NULL;
>   	}
>   
> +	pkt->buf_len = pkt->len;
> +
>   	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>   	if (nbytes != pkt->len) {
>   		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> index e223e2632edd..345f04ee9193 100644
> --- a/include/linux/virtio_vsock.h
> +++ b/include/linux/virtio_vsock.h
> @@ -54,9 +54,17 @@ struct virtio_vsock_pkt {
>   	void *buf;
>   	u32 len;
>   	u32 off;
> +	u32 buf_len;
>   	bool reply;
>   };
>   
> +struct virtio_vsock_buf {
> +	struct list_head list;
> +	void *addr;
> +	u32 len;
> +	u32 off;
> +};
> +
>   struct virtio_vsock_pkt_info {
>   	u32 remote_cid, remote_port;
>   	struct vsock_sock *vsk;
> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> index 15eb5d3d4750..af1d2ce12f54 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -280,6 +280,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
>   			break;
>   		}
>   
> +		pkt->buf_len = buf_len;
>   		pkt->len = buf_len;
>   
>   		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 602715fc9a75..0248d6808755 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -65,6 +65,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>   		pkt->buf = kmalloc(len, GFP_KERNEL);
>   		if (!pkt->buf)
>   			goto out_pkt;
> +
> +		pkt->buf_len = len;
> +
>   		err = memcpy_from_msg(pkt->buf, info->msg, len);
>   		if (err)
>   			goto out;
> @@ -86,6 +89,46 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>   	return NULL;
>   }
>   
> +static struct virtio_vsock_buf *
> +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> +{
> +	struct virtio_vsock_buf *buf;
> +
> +	if (pkt->len == 0)
> +		return NULL;
> +
> +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> +	if (!buf)
> +		return NULL;
> +
> +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
> +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
> +	 * we are not use more memory than that counted by the credit mechanism.
> +	 */
> +	if (zero_copy && pkt->len == pkt->buf_len) {
> +		buf->addr = pkt->buf;
> +		pkt->buf = NULL;
> +	} else {


Is the copy still needed if we're just a few bytes short? We hit a similar
issue in virtio-net, and virtio-net solves it by always copying the first
128 bytes for big packets.

See receive_big().

Thanks


> +		buf->addr = kmalloc(pkt->len, GFP_KERNEL);
> +		if (!buf->addr) {
> +			kfree(buf);
> +			return NULL;
> +		}
> +
> +		memcpy(buf->addr, pkt->buf, pkt->len);
> +	}
> +
> +	buf->len = pkt->len;
> +
> +	return buf;
> +}
> +
> +static void virtio_transport_free_buf(struct virtio_vsock_buf *buf)
> +{
> +	kfree(buf->addr);
> +	kfree(buf);
> +}
> +
>   /* Packet capture */
>   static struct sk_buff *virtio_transport_build_skb(void *opaque)
>   {
> @@ -190,17 +233,15 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>   	return virtio_transport_get_ops()->send_pkt(pkt);
>   }
>   
> -static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
> -					struct virtio_vsock_pkt *pkt)
> +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len)
>   {
> -	vvs->rx_bytes += pkt->len;
> +	vvs->rx_bytes += len;
>   }
>   
> -static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
> -					struct virtio_vsock_pkt *pkt)
> +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, u32 len)
>   {
> -	vvs->rx_bytes -= pkt->len;
> -	vvs->fwd_cnt += pkt->len;
> +	vvs->rx_bytes -= len;
> +	vvs->fwd_cnt += len;
>   }
>   
>   void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
> @@ -254,36 +295,36 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
>   				   size_t len)
>   {
>   	struct virtio_vsock_sock *vvs = vsk->trans;
> -	struct virtio_vsock_pkt *pkt;
> +	struct virtio_vsock_buf *buf;
>   	size_t bytes, total = 0;
>   	int err = -EFAULT;
>   
>   	spin_lock_bh(&vvs->rx_lock);
>   	while (total < len && !list_empty(&vvs->rx_queue)) {
> -		pkt = list_first_entry(&vvs->rx_queue,
> -				       struct virtio_vsock_pkt, list);
> +		buf = list_first_entry(&vvs->rx_queue,
> +				       struct virtio_vsock_buf, list);
>   
>   		bytes = len - total;
> -		if (bytes > pkt->len - pkt->off)
> -			bytes = pkt->len - pkt->off;
> +		if (bytes > buf->len - buf->off)
> +			bytes = buf->len - buf->off;
>   
>   		/* sk_lock is held by caller so no one else can dequeue.
>   		 * Unlock rx_lock since memcpy_to_msg() may sleep.
>   		 */
>   		spin_unlock_bh(&vvs->rx_lock);
>   
> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> +		err = memcpy_to_msg(msg, buf->addr + buf->off, bytes);
>   		if (err)
>   			goto out;
>   
>   		spin_lock_bh(&vvs->rx_lock);
>   
>   		total += bytes;
> -		pkt->off += bytes;
> -		if (pkt->off == pkt->len) {
> -			virtio_transport_dec_rx_pkt(vvs, pkt);
> -			list_del(&pkt->list);
> -			virtio_transport_free_pkt(pkt);
> +		buf->off += bytes;
> +		if (buf->off == buf->len) {
> +			virtio_transport_dec_rx_pkt(vvs, buf->len);
> +			list_del(&buf->list);
> +			virtio_transport_free_buf(buf);
>   		}
>   	}
>   	spin_unlock_bh(&vvs->rx_lock);
> @@ -841,20 +882,24 @@ virtio_transport_recv_connected(struct sock *sk,
>   {
>   	struct vsock_sock *vsk = vsock_sk(sk);
>   	struct virtio_vsock_sock *vvs = vsk->trans;
> +	struct virtio_vsock_buf *buf;
>   	int err = 0;
>   
>   	switch (le16_to_cpu(pkt->hdr.op)) {
>   	case VIRTIO_VSOCK_OP_RW:
>   		pkt->len = le32_to_cpu(pkt->hdr.len);
> -		pkt->off = 0;
> +		buf = virtio_transport_alloc_buf(pkt, true);
>   
> -		spin_lock_bh(&vvs->rx_lock);
> -		virtio_transport_inc_rx_pkt(vvs, pkt);
> -		list_add_tail(&pkt->list, &vvs->rx_queue);
> -		spin_unlock_bh(&vvs->rx_lock);
> +		if (buf) {
> +			spin_lock_bh(&vvs->rx_lock);
> +			virtio_transport_inc_rx_pkt(vvs, pkt->len);
> +			list_add_tail(&buf->list, &vvs->rx_queue);
> +			spin_unlock_bh(&vvs->rx_lock);
>   
> -		sk->sk_data_ready(sk);
> -		return err;
> +			sk->sk_data_ready(sk);
> +		}
> +
> +		break;
>   	case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
>   		sk->sk_write_space(sk);
>   		break;
Stefano Garzarella May 13, 2019, 4:40 p.m. UTC | #3
On Sun, May 12, 2019 at 12:57:48PM -0400, Michael S. Tsirkin wrote:
> On Fri, May 10, 2019 at 02:58:36PM +0200, Stefano Garzarella wrote:
> > Since virtio-vsock was introduced, the buffers filled by the host
> > and pushed to the guest using the vring, are directly queued in
> > a per-socket list avoiding to copy it.
> > These buffers are preallocated by the guest with a fixed
> > size (4 KB).
> > 
> > The maximum amount of memory used by each socket should be
> > controlled by the credit mechanism.
> > The default credit available per-socket is 256 KB, but if we use
> > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > buffers, using up to 1 GB of memory per-socket. In addition, the
> > guest will continue to fill the vring with new 4 KB free buffers
> > to avoid starvation of other sockets.
> > 
> > This patch solves this issue copying the payload in a new buffer.
> > Then it is queued in the per-socket list, and the 4KB buffer used
> > by the host is freed.
> > 
> > In this way, the memory used by each socket respects the credit
> > available, and we still avoid starvation, paying the cost of an
> > extra memory copy. When the buffer is completely full we do a
> > "zero-copy", moving the buffer directly in the per-socket list.
> > 
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > ---
> >  drivers/vhost/vsock.c                   |  2 +
> >  include/linux/virtio_vsock.h            |  8 +++
> >  net/vmw_vsock/virtio_transport.c        |  1 +
> >  net/vmw_vsock/virtio_transport_common.c | 95 ++++++++++++++++++-------
> >  4 files changed, 81 insertions(+), 25 deletions(-)
> > 
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index bb5fc0e9fbc2..7964e2daee09 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -320,6 +320,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
> >  		return NULL;
> >  	}
> >  
> > +	pkt->buf_len = pkt->len;
> > +
> >  	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
> >  	if (nbytes != pkt->len) {
> >  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
> > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > index e223e2632edd..345f04ee9193 100644
> > --- a/include/linux/virtio_vsock.h
> > +++ b/include/linux/virtio_vsock.h
> > @@ -54,9 +54,17 @@ struct virtio_vsock_pkt {
> >  	void *buf;
> >  	u32 len;
> >  	u32 off;
> > +	u32 buf_len;
> >  	bool reply;
> >  };
> >  
> > +struct virtio_vsock_buf {
> > +	struct list_head list;
> > +	void *addr;
> > +	u32 len;
> > +	u32 off;
> > +};
> > +
> >  struct virtio_vsock_pkt_info {
> >  	u32 remote_cid, remote_port;
> >  	struct vsock_sock *vsk;
> > diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> > index 15eb5d3d4750..af1d2ce12f54 100644
> > --- a/net/vmw_vsock/virtio_transport.c
> > +++ b/net/vmw_vsock/virtio_transport.c
> > @@ -280,6 +280,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
> >  			break;
> >  		}
> >  
> > +		pkt->buf_len = buf_len;
> >  		pkt->len = buf_len;
> >  
> >  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> > index 602715fc9a75..0248d6808755 100644
> > --- a/net/vmw_vsock/virtio_transport_common.c
> > +++ b/net/vmw_vsock/virtio_transport_common.c
> > @@ -65,6 +65,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
> >  		pkt->buf = kmalloc(len, GFP_KERNEL);
> >  		if (!pkt->buf)
> >  			goto out_pkt;
> > +
> > +		pkt->buf_len = len;
> > +
> >  		err = memcpy_from_msg(pkt->buf, info->msg, len);
> >  		if (err)
> >  			goto out;
> > @@ -86,6 +89,46 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
> >  	return NULL;
> >  }
> >  
> > +static struct virtio_vsock_buf *
> > +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> > +{
> > +	struct virtio_vsock_buf *buf;
> > +
> > +	if (pkt->len == 0)
> > +		return NULL;
> > +
> > +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> > +	if (!buf)
> > +		return NULL;
> > +
> > +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
> > +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
> > +	 * we are not use
> 
> we do not use
> 

Oh thanks! Will fix!

> > more memory than that counted by the credit mechanism.
> > +	 */
> > +	if (zero_copy && pkt->len == pkt->buf_len) {
> > +		buf->addr = pkt->buf;
> > +		pkt->buf = NULL;
> > +	} else {
> > +		buf->addr = kmalloc(pkt->len, GFP_KERNEL);
> > +		if (!buf->addr) {
> > +			kfree(buf);
> > +			return NULL;
> > +		}
> > +
> > +		memcpy(buf->addr, pkt->buf, pkt->len);
> > +	}
> > +
> > +	buf->len = pkt->len;
> > +
> > +	return buf;
> > +}
> > +
> > +static void virtio_transport_free_buf(struct virtio_vsock_buf *buf)
> > +{
> > +	kfree(buf->addr);
> > +	kfree(buf);
> > +}
> > +
> >  /* Packet capture */
> >  static struct sk_buff *virtio_transport_build_skb(void *opaque)
> >  {
> > @@ -190,17 +233,15 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> >  	return virtio_transport_get_ops()->send_pkt(pkt);
> >  }
> >  
> > -static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
> > -					struct virtio_vsock_pkt *pkt)
> > +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len)
> >  {
> > -	vvs->rx_bytes += pkt->len;
> > +	vvs->rx_bytes += len;
> >  }
> >  
> > -static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
> > -					struct virtio_vsock_pkt *pkt)
> > +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, u32 len)
> >  {
> > -	vvs->rx_bytes -= pkt->len;
> > -	vvs->fwd_cnt += pkt->len;
> > +	vvs->rx_bytes -= len;
> > +	vvs->fwd_cnt += len;
> >  }
> >  
> >  void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
> > @@ -254,36 +295,36 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
> >  				   size_t len)
> >  {
> >  	struct virtio_vsock_sock *vvs = vsk->trans;
> > -	struct virtio_vsock_pkt *pkt;
> > +	struct virtio_vsock_buf *buf;
> >  	size_t bytes, total = 0;
> >  	int err = -EFAULT;
> >  
> >  	spin_lock_bh(&vvs->rx_lock);
> >  	while (total < len && !list_empty(&vvs->rx_queue)) {
> > -		pkt = list_first_entry(&vvs->rx_queue,
> > -				       struct virtio_vsock_pkt, list);
> > +		buf = list_first_entry(&vvs->rx_queue,
> > +				       struct virtio_vsock_buf, list);
> >  
> >  		bytes = len - total;
> > -		if (bytes > pkt->len - pkt->off)
> > -			bytes = pkt->len - pkt->off;
> > +		if (bytes > buf->len - buf->off)
> > +			bytes = buf->len - buf->off;
> >  
> >  		/* sk_lock is held by caller so no one else can dequeue.
> >  		 * Unlock rx_lock since memcpy_to_msg() may sleep.
> >  		 */
> >  		spin_unlock_bh(&vvs->rx_lock);
> >  
> > -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
> > +		err = memcpy_to_msg(msg, buf->addr + buf->off, bytes);
> >  		if (err)
> >  			goto out;
> >  
> >  		spin_lock_bh(&vvs->rx_lock);
> >  
> >  		total += bytes;
> > -		pkt->off += bytes;
> > -		if (pkt->off == pkt->len) {
> > -			virtio_transport_dec_rx_pkt(vvs, pkt);
> > -			list_del(&pkt->list);
> > -			virtio_transport_free_pkt(pkt);
> > +		buf->off += bytes;
> > +		if (buf->off == buf->len) {
> > +			virtio_transport_dec_rx_pkt(vvs, buf->len);
> > +			list_del(&buf->list);
> > +			virtio_transport_free_buf(buf);
> >  		}
> >  	}
> >  	spin_unlock_bh(&vvs->rx_lock);
> > @@ -841,20 +882,24 @@ virtio_transport_recv_connected(struct sock *sk,
> >  {
> >  	struct vsock_sock *vsk = vsock_sk(sk);
> >  	struct virtio_vsock_sock *vvs = vsk->trans;
> > +	struct virtio_vsock_buf *buf;
> >  	int err = 0;
> >  
> >  	switch (le16_to_cpu(pkt->hdr.op)) {
> >  	case VIRTIO_VSOCK_OP_RW:
> >  		pkt->len = le32_to_cpu(pkt->hdr.len);
> > -		pkt->off = 0;
> > +		buf = virtio_transport_alloc_buf(pkt, true);
> 
> 
> This seems to be the only callers and second parameter
> is always true. So why is it needed?

Right. It was a leftover, I'll remove it.

> 
> >  
> > -		spin_lock_bh(&vvs->rx_lock);
> > -		virtio_transport_inc_rx_pkt(vvs, pkt);
> > -		list_add_tail(&pkt->list, &vvs->rx_queue);
> > -		spin_unlock_bh(&vvs->rx_lock);
> > +		if (buf) {
> > +			spin_lock_bh(&vvs->rx_lock);
> > +			virtio_transport_inc_rx_pkt(vvs, pkt->len);
> > +			list_add_tail(&buf->list, &vvs->rx_queue);
> > +			spin_unlock_bh(&vvs->rx_lock);
> >  
> > -		sk->sk_data_ready(sk);
> > -		return err;
> > +			sk->sk_data_ready(sk);
> > +		}
> > +
> > +		break;
> >  	case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
> >  		sk->sk_write_space(sk);
> >  		break;

Thanks for the review,
Stefano
Stefano Garzarella May 13, 2019, 5:23 p.m. UTC | #4
On Mon, May 13, 2019 at 05:58:53PM +0800, Jason Wang wrote:
> 
> On 2019/5/10 下午8:58, Stefano Garzarella wrote:
> > Since virtio-vsock was introduced, the buffers filled by the host
> > and pushed to the guest using the vring, are directly queued in
> > a per-socket list avoiding to copy it.
> > These buffers are preallocated by the guest with a fixed
> > size (4 KB).
> > 
> > The maximum amount of memory used by each socket should be
> > controlled by the credit mechanism.
> > The default credit available per-socket is 256 KB, but if we use
> > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > buffers, using up to 1 GB of memory per-socket. In addition, the
> > guest will continue to fill the vring with new 4 KB free buffers
> > to avoid starvation of other sockets.
> > 
> > This patch solves this issue copying the payload in a new buffer.
> > Then it is queued in the per-socket list, and the 4KB buffer used
> > by the host is freed.
> > 
> > In this way, the memory used by each socket respects the credit
> > available, and we still avoid starvation, paying the cost of an
> > extra memory copy. When the buffer is completely full we do a
> > "zero-copy", moving the buffer directly in the per-socket list.
> 
> 
> I wonder in the long run we should use generic socket accouting mechanism
> provided by kernel (e.g socket, skb, sndbuf, recvbug, truesize) instead of
> vsock specific thing to avoid duplicating efforts.

I agree, the idea is to switch to sk_buff, but this would require a huge
change. If we use the virtio-net datapath, it will become simpler.

> 
> 
> > 
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > ---
> >   drivers/vhost/vsock.c                   |  2 +
> >   include/linux/virtio_vsock.h            |  8 +++
> >   net/vmw_vsock/virtio_transport.c        |  1 +
> >   net/vmw_vsock/virtio_transport_common.c | 95 ++++++++++++++++++-------
> >   4 files changed, 81 insertions(+), 25 deletions(-)
> > 
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index bb5fc0e9fbc2..7964e2daee09 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -320,6 +320,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
> >   		return NULL;
> >   	}
> > +	pkt->buf_len = pkt->len;
> > +
> >   	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
> >   	if (nbytes != pkt->len) {
> >   		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
> > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > index e223e2632edd..345f04ee9193 100644
> > --- a/include/linux/virtio_vsock.h
> > +++ b/include/linux/virtio_vsock.h
> > @@ -54,9 +54,17 @@ struct virtio_vsock_pkt {
> >   	void *buf;
> >   	u32 len;
> >   	u32 off;
> > +	u32 buf_len;
> >   	bool reply;
> >   };
> > +struct virtio_vsock_buf {
> > +	struct list_head list;
> > +	void *addr;
> > +	u32 len;
> > +	u32 off;
> > +};
> > +
> >   struct virtio_vsock_pkt_info {
> >   	u32 remote_cid, remote_port;
> >   	struct vsock_sock *vsk;
> > diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> > index 15eb5d3d4750..af1d2ce12f54 100644
> > --- a/net/vmw_vsock/virtio_transport.c
> > +++ b/net/vmw_vsock/virtio_transport.c
> > @@ -280,6 +280,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
> >   			break;
> >   		}
> > +		pkt->buf_len = buf_len;
> >   		pkt->len = buf_len;
> >   		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> > index 602715fc9a75..0248d6808755 100644
> > --- a/net/vmw_vsock/virtio_transport_common.c
> > +++ b/net/vmw_vsock/virtio_transport_common.c
> > @@ -65,6 +65,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
> >   		pkt->buf = kmalloc(len, GFP_KERNEL);
> >   		if (!pkt->buf)
> >   			goto out_pkt;
> > +
> > +		pkt->buf_len = len;
> > +
> >   		err = memcpy_from_msg(pkt->buf, info->msg, len);
> >   		if (err)
> >   			goto out;
> > @@ -86,6 +89,46 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
> >   	return NULL;
> >   }
> > +static struct virtio_vsock_buf *
> > +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> > +{
> > +	struct virtio_vsock_buf *buf;
> > +
> > +	if (pkt->len == 0)
> > +		return NULL;
> > +
> > +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> > +	if (!buf)
> > +		return NULL;
> > +
> > +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
> > +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
> > +	 * we are not use more memory than that counted by the credit mechanism.
> > +	 */
> > +	if (zero_copy && pkt->len == pkt->buf_len) {
> > +		buf->addr = pkt->buf;
> > +		pkt->buf = NULL;
> > +	} else {
> 
> 
> Is the copy still needed if we're just few bytes less? We meet similar issue
> for virito-net, and virtio-net solve this by always copy first 128bytes for
> big packets.
> 
> See receive_big()

I'm looking at it now; it is more sophisticated.
IIUC, virtio-net allocates a sk_buff with 128 bytes of buffer, then copies the
first 128 bytes, then adds the buffer used to receive the packet as a frag to
the skb.

Do you suggest implementing something similar, or can we use my approach
for now and, if we later merge the datapath, reuse the virtio-net
approach?

Thanks,
Stefano
Jason Wang May 14, 2019, 3:25 a.m. UTC | #5
On 2019/5/14 上午1:23, Stefano Garzarella wrote:
> On Mon, May 13, 2019 at 05:58:53PM +0800, Jason Wang wrote:
>> On 2019/5/10 下午8:58, Stefano Garzarella wrote:
>>> Since virtio-vsock was introduced, the buffers filled by the host
>>> and pushed to the guest using the vring, are directly queued in
>>> a per-socket list avoiding to copy it.
>>> These buffers are preallocated by the guest with a fixed
>>> size (4 KB).
>>>
>>> The maximum amount of memory used by each socket should be
>>> controlled by the credit mechanism.
>>> The default credit available per-socket is 256 KB, but if we use
>>> only 1 byte per packet, the guest can queue up to 262144 of 4 KB
>>> buffers, using up to 1 GB of memory per-socket. In addition, the
>>> guest will continue to fill the vring with new 4 KB free buffers
>>> to avoid starvation of other sockets.
>>>
>>> This patch solves this issue copying the payload in a new buffer.
>>> Then it is queued in the per-socket list, and the 4KB buffer used
>>> by the host is freed.
>>>
>>> In this way, the memory used by each socket respects the credit
>>> available, and we still avoid starvation, paying the cost of an
>>> extra memory copy. When the buffer is completely full we do a
>>> "zero-copy", moving the buffer directly in the per-socket list.
>>
>> I wonder in the long run we should use generic socket accouting mechanism
>> provided by kernel (e.g socket, skb, sndbuf, recvbug, truesize) instead of
>> vsock specific thing to avoid duplicating efforts.
> I agree, the idea is to switch to sk_buff but this should require an huge
> change. If we will use the virtio-net datapath, it will become simpler.


Yes, the Unix domain socket is one example that uses the generic skb and
socket structures. And we probably need some kind of socket pair on the
host. Using sockets can also simplify the unification with vhost-net, which
depends on the socket proto_ops to work. I admit it's probably a huge
change, but we can do it gradually.


>>
>>> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>>> ---
>>>    drivers/vhost/vsock.c                   |  2 +
>>>    include/linux/virtio_vsock.h            |  8 +++
>>>    net/vmw_vsock/virtio_transport.c        |  1 +
>>>    net/vmw_vsock/virtio_transport_common.c | 95 ++++++++++++++++++-------
>>>    4 files changed, 81 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>> index bb5fc0e9fbc2..7964e2daee09 100644
>>> --- a/drivers/vhost/vsock.c
>>> +++ b/drivers/vhost/vsock.c
>>> @@ -320,6 +320,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
>>>    		return NULL;
>>>    	}
>>> +	pkt->buf_len = pkt->len;
>>> +
>>>    	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>>>    	if (nbytes != pkt->len) {
>>>    		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>> index e223e2632edd..345f04ee9193 100644
>>> --- a/include/linux/virtio_vsock.h
>>> +++ b/include/linux/virtio_vsock.h
>>> @@ -54,9 +54,17 @@ struct virtio_vsock_pkt {
>>>    	void *buf;
>>>    	u32 len;
>>>    	u32 off;
>>> +	u32 buf_len;
>>>    	bool reply;
>>>    };
>>> +struct virtio_vsock_buf {
>>> +	struct list_head list;
>>> +	void *addr;
>>> +	u32 len;
>>> +	u32 off;
>>> +};
>>> +
>>>    struct virtio_vsock_pkt_info {
>>>    	u32 remote_cid, remote_port;
>>>    	struct vsock_sock *vsk;
>>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>>> index 15eb5d3d4750..af1d2ce12f54 100644
>>> --- a/net/vmw_vsock/virtio_transport.c
>>> +++ b/net/vmw_vsock/virtio_transport.c
>>> @@ -280,6 +280,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
>>>    			break;
>>>    		}
>>> +		pkt->buf_len = buf_len;
>>>    		pkt->len = buf_len;
>>>    		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>> index 602715fc9a75..0248d6808755 100644
>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>> @@ -65,6 +65,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>>>    		pkt->buf = kmalloc(len, GFP_KERNEL);
>>>    		if (!pkt->buf)
>>>    			goto out_pkt;
>>> +
>>> +		pkt->buf_len = len;
>>> +
>>>    		err = memcpy_from_msg(pkt->buf, info->msg, len);
>>>    		if (err)
>>>    			goto out;
>>> @@ -86,6 +89,46 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>>>    	return NULL;
>>>    }
>>> +static struct virtio_vsock_buf *
>>> +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
>>> +{
>>> +	struct virtio_vsock_buf *buf;
>>> +
>>> +	if (pkt->len == 0)
>>> +		return NULL;
>>> +
>>> +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
>>> +	if (!buf)
>>> +		return NULL;
>>> +
>>> +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
>>> +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
>>> +	 * we are not use more memory than that counted by the credit mechanism.
>>> +	 */
>>> +	if (zero_copy && pkt->len == pkt->buf_len) {
>>> +		buf->addr = pkt->buf;
>>> +		pkt->buf = NULL;
>>> +	} else {
>>
>> Is the copy still needed if we're just few bytes less? We meet similar issue
>> for virito-net, and virtio-net solve this by always copy first 128bytes for
>> big packets.
>>
>> See receive_big()
> I'm seeing, It is more sophisticated.
> IIUC, virtio-net allocates a sk_buff with 128 bytes of buffer, then copies the
> first 128 bytes, then adds the buffer used to receive the packet as a frag to
> the skb.


Yes and the point is if the packet is smaller than 128 bytes the pages 
will be recycled.


>
> Do you suggest to implement something similar, or for now we can use my
> approach and if we will merge the datapath we can reuse the virtio-net
> approach?


I think we need a better threshold. If I understand the patch correctly,
we will copy unless the packet is 64K when the guest is receiving.
The 1-byte packet case is indeed a problem, but we need to solve it
without losing too much performance.

Thanks


>
> Thanks,
> Stefano
Jason Wang May 14, 2019, 3:40 a.m. UTC | #6
On 2019/5/14 上午11:25, Jason Wang wrote:
>
> On 2019/5/14 上午1:23, Stefano Garzarella wrote:
>> On Mon, May 13, 2019 at 05:58:53PM +0800, Jason Wang wrote:
>>> On 2019/5/10 下午8:58, Stefano Garzarella wrote:
>>>> Since virtio-vsock was introduced, the buffers filled by the host
>>>> and pushed to the guest using the vring, are directly queued in
>>>> a per-socket list avoiding to copy it.
>>>> These buffers are preallocated by the guest with a fixed
>>>> size (4 KB).
>>>>
>>>> The maximum amount of memory used by each socket should be
>>>> controlled by the credit mechanism.
>>>> The default credit available per-socket is 256 KB, but if we use
>>>> only 1 byte per packet, the guest can queue up to 262144 of 4 KB
>>>> buffers, using up to 1 GB of memory per-socket. In addition, the
>>>> guest will continue to fill the vring with new 4 KB free buffers
>>>> to avoid starvation of other sockets.
>>>>
>>>> This patch solves this issue copying the payload in a new buffer.
>>>> Then it is queued in the per-socket list, and the 4KB buffer used
>>>> by the host is freed.
>>>>
>>>> In this way, the memory used by each socket respects the credit
>>>> available, and we still avoid starvation, paying the cost of an
>>>> extra memory copy. When the buffer is completely full we do a
>>>> "zero-copy", moving the buffer directly in the per-socket list.
>>>
>>> I wonder in the long run we should use generic socket accouting 
>>> mechanism
>>> provided by kernel (e.g socket, skb, sndbuf, recvbug, truesize) 
>>> instead of
>>> vsock specific thing to avoid duplicating efforts.
>> I agree, the idea is to switch to sk_buff but this should require an 
>> huge
>> change. If we will use the virtio-net datapath, it will become simpler.
>
>
> Yes, unix domain socket is one example that uses general skb and 
> socket structure. And we probably need some kind of socket pair on 
> host. Using socket can also simplify the unification with vhost-net 
> which depends on the socket proto_ops to work. I admit it's a huge 
> change probably, we can do it gradually.
>
>
>>>
>>>> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>>>> ---
>>>>    drivers/vhost/vsock.c                   |  2 +
>>>>    include/linux/virtio_vsock.h            |  8 +++
>>>>    net/vmw_vsock/virtio_transport.c        |  1 +
>>>>    net/vmw_vsock/virtio_transport_common.c | 95 
>>>> ++++++++++++++++++-------
>>>>    4 files changed, 81 insertions(+), 25 deletions(-)
>>>>
>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>> index bb5fc0e9fbc2..7964e2daee09 100644
>>>> --- a/drivers/vhost/vsock.c
>>>> +++ b/drivers/vhost/vsock.c
>>>> @@ -320,6 +320,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
>>>>            return NULL;
>>>>        }
>>>> +    pkt->buf_len = pkt->len;
>>>> +
>>>>        nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>>>>        if (nbytes != pkt->len) {
>>>>            vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
>>>> diff --git a/include/linux/virtio_vsock.h 
>>>> b/include/linux/virtio_vsock.h
>>>> index e223e2632edd..345f04ee9193 100644
>>>> --- a/include/linux/virtio_vsock.h
>>>> +++ b/include/linux/virtio_vsock.h
>>>> @@ -54,9 +54,17 @@ struct virtio_vsock_pkt {
>>>>        void *buf;
>>>>        u32 len;
>>>>        u32 off;
>>>> +    u32 buf_len;
>>>>        bool reply;
>>>>    };
>>>> +struct virtio_vsock_buf {
>>>> +    struct list_head list;
>>>> +    void *addr;
>>>> +    u32 len;
>>>> +    u32 off;
>>>> +};
>>>> +
>>>>    struct virtio_vsock_pkt_info {
>>>>        u32 remote_cid, remote_port;
>>>>        struct vsock_sock *vsk;
>>>> diff --git a/net/vmw_vsock/virtio_transport.c 
>>>> b/net/vmw_vsock/virtio_transport.c
>>>> index 15eb5d3d4750..af1d2ce12f54 100644
>>>> --- a/net/vmw_vsock/virtio_transport.c
>>>> +++ b/net/vmw_vsock/virtio_transport.c
>>>> @@ -280,6 +280,7 @@ static void virtio_vsock_rx_fill(struct 
>>>> virtio_vsock *vsock)
>>>>                break;
>>>>            }
>>>> +        pkt->buf_len = buf_len;
>>>>            pkt->len = buf_len;
>>>>            sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>>>> b/net/vmw_vsock/virtio_transport_common.c
>>>> index 602715fc9a75..0248d6808755 100644
>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>> @@ -65,6 +65,9 @@ virtio_transport_alloc_pkt(struct 
>>>> virtio_vsock_pkt_info *info,
>>>>            pkt->buf = kmalloc(len, GFP_KERNEL);
>>>>            if (!pkt->buf)
>>>>                goto out_pkt;
>>>> +
>>>> +        pkt->buf_len = len;
>>>> +
>>>>            err = memcpy_from_msg(pkt->buf, info->msg, len);
>>>>            if (err)
>>>>                goto out;
>>>> @@ -86,6 +89,46 @@ virtio_transport_alloc_pkt(struct 
>>>> virtio_vsock_pkt_info *info,
>>>>        return NULL;
>>>>    }
>>>> +static struct virtio_vsock_buf *
>>>> +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool 
>>>> zero_copy)
>>>> +{
>>>> +    struct virtio_vsock_buf *buf;
>>>> +
>>>> +    if (pkt->len == 0)
>>>> +        return NULL;
>>>> +
>>>> +    buf = kzalloc(sizeof(*buf), GFP_KERNEL);
>>>> +    if (!buf)
>>>> +        return NULL;
>>>> +
>>>> +    /* If the buffer in the virtio_vsock_pkt is full, we can move 
>>>> it to
>>>> +     * the new virtio_vsock_buf avoiding the copy, because we are 
>>>> sure that
>>>> +     * we are not use more memory than that counted by the credit 
>>>> mechanism.
>>>> +     */
>>>> +    if (zero_copy && pkt->len == pkt->buf_len) {
>>>> +        buf->addr = pkt->buf;
>>>> +        pkt->buf = NULL;
>>>> +    } else {
>>>
>>> Is the copy still needed if we're just few bytes less? We meet 
>>> similar issue
>>> for virito-net, and virtio-net solve this by always copy first 
>>> 128bytes for
>>> big packets.
>>>
>>> See receive_big()
>> I'm seeing, It is more sophisticated.
>> IIUC, virtio-net allocates a sk_buff with 128 bytes of buffer, then 
>> copies the
>> first 128 bytes, then adds the buffer used to receive the packet as a 
>> frag to
>> the skb.
>
>
> Yes and the point is if the packet is smaller than 128 bytes the pages 
> will be recycled. 


To be clear, this only works if you use order-0 pages instead of a large
buffer that is allocated through kmalloc(). So using order-0 pages is
another requirement.

Thanks
Stefano Garzarella May 14, 2019, 4:35 p.m. UTC | #7
On Tue, May 14, 2019 at 11:25:34AM +0800, Jason Wang wrote:
> 
> On 2019/5/14 上午1:23, Stefano Garzarella wrote:
> > On Mon, May 13, 2019 at 05:58:53PM +0800, Jason Wang wrote:
> > > On 2019/5/10 下午8:58, Stefano Garzarella wrote:
> > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > and pushed to the guest using the vring, are directly queued in
> > > > a per-socket list avoiding to copy it.
> > > > These buffers are preallocated by the guest with a fixed
> > > > size (4 KB).
> > > > 
> > > > The maximum amount of memory used by each socket should be
> > > > controlled by the credit mechanism.
> > > > The default credit available per-socket is 256 KB, but if we use
> > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > to avoid starvation of other sockets.
> > > > 
> > > > This patch solves this issue copying the payload in a new buffer.
> > > > Then it is queued in the per-socket list, and the 4KB buffer used
> > > > by the host is freed.
> > > > 
> > > > In this way, the memory used by each socket respects the credit
> > > > available, and we still avoid starvation, paying the cost of an
> > > > extra memory copy. When the buffer is completely full we do a
> > > > "zero-copy", moving the buffer directly in the per-socket list.
> > > 
> > > I wonder in the long run we should use generic socket accouting mechanism
> > > provided by kernel (e.g socket, skb, sndbuf, recvbug, truesize) instead of
> > > vsock specific thing to avoid duplicating efforts.
> > I agree, the idea is to switch to sk_buff but this should require an huge
> > change. If we will use the virtio-net datapath, it will become simpler.
> 
> 
> Yes, unix domain socket is one example that uses general skb and socket
> structure. And we probably need some kind of socket pair on host. Using
> socket can also simplify the unification with vhost-net which depends on the
> socket proto_ops to work. I admit it's a huge change probably, we can do it
> gradually.
> 

Yes, I also prefer to do this change gradually :)

> 
> > > 
> > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > ---
> > > >    drivers/vhost/vsock.c                   |  2 +
> > > >    include/linux/virtio_vsock.h            |  8 +++
> > > >    net/vmw_vsock/virtio_transport.c        |  1 +
> > > >    net/vmw_vsock/virtio_transport_common.c | 95 ++++++++++++++++++-------
> > > >    4 files changed, 81 insertions(+), 25 deletions(-)
> > > > 
> > > > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > > > index bb5fc0e9fbc2..7964e2daee09 100644
> > > > --- a/drivers/vhost/vsock.c
> > > > +++ b/drivers/vhost/vsock.c
> > > > @@ -320,6 +320,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
> > > >    		return NULL;
> > > >    	}
> > > > +	pkt->buf_len = pkt->len;
> > > > +
> > > >    	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
> > > >    	if (nbytes != pkt->len) {
> > > >    		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
> > > > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > > > index e223e2632edd..345f04ee9193 100644
> > > > --- a/include/linux/virtio_vsock.h
> > > > +++ b/include/linux/virtio_vsock.h
> > > > @@ -54,9 +54,17 @@ struct virtio_vsock_pkt {
> > > >    	void *buf;
> > > >    	u32 len;
> > > >    	u32 off;
> > > > +	u32 buf_len;
> > > >    	bool reply;
> > > >    };
> > > > +struct virtio_vsock_buf {
> > > > +	struct list_head list;
> > > > +	void *addr;
> > > > +	u32 len;
> > > > +	u32 off;
> > > > +};
> > > > +
> > > >    struct virtio_vsock_pkt_info {
> > > >    	u32 remote_cid, remote_port;
> > > >    	struct vsock_sock *vsk;
> > > > diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> > > > index 15eb5d3d4750..af1d2ce12f54 100644
> > > > --- a/net/vmw_vsock/virtio_transport.c
> > > > +++ b/net/vmw_vsock/virtio_transport.c
> > > > @@ -280,6 +280,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
> > > >    			break;
> > > >    		}
> > > > +		pkt->buf_len = buf_len;
> > > >    		pkt->len = buf_len;
> > > >    		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> > > > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> > > > index 602715fc9a75..0248d6808755 100644
> > > > --- a/net/vmw_vsock/virtio_transport_common.c
> > > > +++ b/net/vmw_vsock/virtio_transport_common.c
> > > > @@ -65,6 +65,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
> > > >    		pkt->buf = kmalloc(len, GFP_KERNEL);
> > > >    		if (!pkt->buf)
> > > >    			goto out_pkt;
> > > > +
> > > > +		pkt->buf_len = len;
> > > > +
> > > >    		err = memcpy_from_msg(pkt->buf, info->msg, len);
> > > >    		if (err)
> > > >    			goto out;
> > > > @@ -86,6 +89,46 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
> > > >    	return NULL;
> > > >    }
> > > > +static struct virtio_vsock_buf *
> > > > +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> > > > +{
> > > > +	struct virtio_vsock_buf *buf;
> > > > +
> > > > +	if (pkt->len == 0)
> > > > +		return NULL;
> > > > +
> > > > +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> > > > +	if (!buf)
> > > > +		return NULL;
> > > > +
> > > > +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
> > > > +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
> > > > +	 * we are not use more memory than that counted by the credit mechanism.
> > > > +	 */
> > > > +	if (zero_copy && pkt->len == pkt->buf_len) {
> > > > +		buf->addr = pkt->buf;
> > > > +		pkt->buf = NULL;
> > > > +	} else {
> > > 
> > > Is the copy still needed if we're just few bytes less? We meet similar issue
> > > for virito-net, and virtio-net solve this by always copy first 128bytes for
> > > big packets.
> > > 
> > > See receive_big()
> > I'm seeing, It is more sophisticated.
> > IIUC, virtio-net allocates a sk_buff with 128 bytes of buffer, then copies the
> > first 128 bytes, then adds the buffer used to receive the packet as a frag to
> > the skb.
> 
> 
> Yes and the point is if the packet is smaller than 128 bytes the pages will
> be recycled.
> 
> 

So it avoids the overhead of allocating a large buffer. I got it.

Just out of curiosity, why is the threshold 128 bytes?

> > 
> > Do you suggest to implement something similar, or for now we can use my
> > approach and if we will merge the datapath we can reuse the virtio-net
> > approach?
> 
> 
> I think we need a better threshold. If I understand the patch correctly, we
> will do copy unless the packet is 64K when guest is doing receiving. 1 byte
> packet is indeed a problem, but we need to solve it without losing too much
> performance.

That is correct. I'll try to figure out a better threshold and how to use
order-0 pages.

Thanks again for your advice,
Stefano
Jason Wang May 15, 2019, 2:48 a.m. UTC | #8
On 2019/5/15 上午12:35, Stefano Garzarella wrote:
> On Tue, May 14, 2019 at 11:25:34AM +0800, Jason Wang wrote:
>> On 2019/5/14 上午1:23, Stefano Garzarella wrote:
>>> On Mon, May 13, 2019 at 05:58:53PM +0800, Jason Wang wrote:
>>>> On 2019/5/10 下午8:58, Stefano Garzarella wrote:
>>>>> Since virtio-vsock was introduced, the buffers filled by the host
>>>>> and pushed to the guest using the vring, are directly queued in
>>>>> a per-socket list avoiding to copy it.
>>>>> These buffers are preallocated by the guest with a fixed
>>>>> size (4 KB).
>>>>>
>>>>> The maximum amount of memory used by each socket should be
>>>>> controlled by the credit mechanism.
>>>>> The default credit available per-socket is 256 KB, but if we use
>>>>> only 1 byte per packet, the guest can queue up to 262144 of 4 KB
>>>>> buffers, using up to 1 GB of memory per-socket. In addition, the
>>>>> guest will continue to fill the vring with new 4 KB free buffers
>>>>> to avoid starvation of other sockets.
>>>>>
>>>>> This patch solves this issue copying the payload in a new buffer.
>>>>> Then it is queued in the per-socket list, and the 4KB buffer used
>>>>> by the host is freed.
>>>>>
>>>>> In this way, the memory used by each socket respects the credit
>>>>> available, and we still avoid starvation, paying the cost of an
>>>>> extra memory copy. When the buffer is completely full we do a
>>>>> "zero-copy", moving the buffer directly in the per-socket list.
>>>> I wonder in the long run we should use generic socket accouting mechanism
>>>> provided by kernel (e.g socket, skb, sndbuf, recvbug, truesize) instead of
>>>> vsock specific thing to avoid duplicating efforts.
>>> I agree, the idea is to switch to sk_buff but this should require an huge
>>> change. If we will use the virtio-net datapath, it will become simpler.
>>
>> Yes, unix domain socket is one example that uses general skb and socket
>> structure. And we probably need some kind of socket pair on host. Using
>> socket can also simplify the unification with vhost-net which depends on the
>> socket proto_ops to work. I admit it's a huge change probably, we can do it
>> gradually.
>>
> Yes, I also prefer to do this change gradually :)
>
>>>>> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>>>>> ---
>>>>>     drivers/vhost/vsock.c                   |  2 +
>>>>>     include/linux/virtio_vsock.h            |  8 +++
>>>>>     net/vmw_vsock/virtio_transport.c        |  1 +
>>>>>     net/vmw_vsock/virtio_transport_common.c | 95 ++++++++++++++++++-------
>>>>>     4 files changed, 81 insertions(+), 25 deletions(-)
>>>>>
>>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>>> index bb5fc0e9fbc2..7964e2daee09 100644
>>>>> --- a/drivers/vhost/vsock.c
>>>>> +++ b/drivers/vhost/vsock.c
>>>>> @@ -320,6 +320,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
>>>>>     		return NULL;
>>>>>     	}
>>>>> +	pkt->buf_len = pkt->len;
>>>>> +
>>>>>     	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>>>>>     	if (nbytes != pkt->len) {
>>>>>     		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
>>>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>>>> index e223e2632edd..345f04ee9193 100644
>>>>> --- a/include/linux/virtio_vsock.h
>>>>> +++ b/include/linux/virtio_vsock.h
>>>>> @@ -54,9 +54,17 @@ struct virtio_vsock_pkt {
>>>>>     	void *buf;
>>>>>     	u32 len;
>>>>>     	u32 off;
>>>>> +	u32 buf_len;
>>>>>     	bool reply;
>>>>>     };
>>>>> +struct virtio_vsock_buf {
>>>>> +	struct list_head list;
>>>>> +	void *addr;
>>>>> +	u32 len;
>>>>> +	u32 off;
>>>>> +};
>>>>> +
>>>>>     struct virtio_vsock_pkt_info {
>>>>>     	u32 remote_cid, remote_port;
>>>>>     	struct vsock_sock *vsk;
>>>>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>>>>> index 15eb5d3d4750..af1d2ce12f54 100644
>>>>> --- a/net/vmw_vsock/virtio_transport.c
>>>>> +++ b/net/vmw_vsock/virtio_transport.c
>>>>> @@ -280,6 +280,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
>>>>>     			break;
>>>>>     		}
>>>>> +		pkt->buf_len = buf_len;
>>>>>     		pkt->len = buf_len;
>>>>>     		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>>>> index 602715fc9a75..0248d6808755 100644
>>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>>> @@ -65,6 +65,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>>>>>     		pkt->buf = kmalloc(len, GFP_KERNEL);
>>>>>     		if (!pkt->buf)
>>>>>     			goto out_pkt;
>>>>> +
>>>>> +		pkt->buf_len = len;
>>>>> +
>>>>>     		err = memcpy_from_msg(pkt->buf, info->msg, len);
>>>>>     		if (err)
>>>>>     			goto out;
>>>>> @@ -86,6 +89,46 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>>>>>     	return NULL;
>>>>>     }
>>>>> +static struct virtio_vsock_buf *
>>>>> +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
>>>>> +{
>>>>> +	struct virtio_vsock_buf *buf;
>>>>> +
>>>>> +	if (pkt->len == 0)
>>>>> +		return NULL;
>>>>> +
>>>>> +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
>>>>> +	if (!buf)
>>>>> +		return NULL;
>>>>> +
>>>>> +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
>>>>> +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
>>>>> +	 * we are not use more memory than that counted by the credit mechanism.
>>>>> +	 */
>>>>> +	if (zero_copy && pkt->len == pkt->buf_len) {
>>>>> +		buf->addr = pkt->buf;
>>>>> +		pkt->buf = NULL;
>>>>> +	} else {
>>>> Is the copy still needed if we're just few bytes less? We meet similar issue
>>>> for virito-net, and virtio-net solve this by always copy first 128bytes for
>>>> big packets.
>>>>
>>>> See receive_big()
>>> I'm seeing, It is more sophisticated.
>>> IIUC, virtio-net allocates a sk_buff with 128 bytes of buffer, then copies the
>>> first 128 bytes, then adds the buffer used to receive the packet as a frag to
>>> the skb.
>>
>> Yes and the point is if the packet is smaller than 128 bytes the pages will
>> be recycled.
>>
>>
> So it's avoid the overhead of allocation of a large buffer. I got it.
>
> Just a curiosity, why the threshold is 128 bytes?


From its name (GOOD_COPY_LEN), I think it is just a value that won't lose
much performance, e.g. the size of two cachelines.

Thanks


>
>>> Do you suggest to implement something similar, or for now we can use my
>>> approach and if we will merge the datapath we can reuse the virtio-net
>>> approach?
>>
>> I think we need a better threshold. If I understand the patch correctly, we
>> will do copy unless the packet is 64K when guest is doing receiving. 1 byte
>> packet is indeed a problem, but we need to solve it without losing too much
>> performance.
> It is correct. I'll try to figure out a better threshold and the usage of
> order 0 page.
>
> Thanks again for your advices,
> Stefano
Stefan Hajnoczi May 16, 2019, 3:25 p.m. UTC | #9
On Fri, May 10, 2019 at 02:58:36PM +0200, Stefano Garzarella wrote:
> +struct virtio_vsock_buf {

Please add a comment describing the purpose of this struct and to
differentiate its use from struct virtio_vsock_pkt.

> +static struct virtio_vsock_buf *
> +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> +{
> +	struct virtio_vsock_buf *buf;
> +
> +	if (pkt->len == 0)
> +		return NULL;
> +
> +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> +	if (!buf)
> +		return NULL;
> +
> +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
> +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
> +	 * we are not use more memory than that counted by the credit mechanism.
> +	 */
> +	if (zero_copy && pkt->len == pkt->buf_len) {
> +		buf->addr = pkt->buf;
> +		pkt->buf = NULL;
> +	} else {
> +		buf->addr = kmalloc(pkt->len, GFP_KERNEL);

buf and buf->addr could be allocated in a single call, though I'm not
sure how big an optimization this is.

> @@ -841,20 +882,24 @@ virtio_transport_recv_connected(struct sock *sk,
>  {
>  	struct vsock_sock *vsk = vsock_sk(sk);
>  	struct virtio_vsock_sock *vvs = vsk->trans;
> +	struct virtio_vsock_buf *buf;
>  	int err = 0;
>  
>  	switch (le16_to_cpu(pkt->hdr.op)) {
>  	case VIRTIO_VSOCK_OP_RW:
>  		pkt->len = le32_to_cpu(pkt->hdr.len);
> -		pkt->off = 0;
> +		buf = virtio_transport_alloc_buf(pkt, true);
>  
> -		spin_lock_bh(&vvs->rx_lock);
> -		virtio_transport_inc_rx_pkt(vvs, pkt);
> -		list_add_tail(&pkt->list, &vvs->rx_queue);
> -		spin_unlock_bh(&vvs->rx_lock);
> +		if (buf) {
> +			spin_lock_bh(&vvs->rx_lock);
> +			virtio_transport_inc_rx_pkt(vvs, pkt->len);
> +			list_add_tail(&buf->list, &vvs->rx_queue);
> +			spin_unlock_bh(&vvs->rx_lock);
>  
> -		sk->sk_data_ready(sk);
> -		return err;
> +			sk->sk_data_ready(sk);
> +		}

The return value of this function isn't used but the code still makes an
effort to return errors.  Please return -ENOMEM when buf == NULL.

If you'd like to remove the return value that's fine too, but please do
it for the whole function to be consistent.
Stefano Garzarella May 17, 2019, 8:25 a.m. UTC | #10
On Thu, May 16, 2019 at 04:25:33PM +0100, Stefan Hajnoczi wrote:
> On Fri, May 10, 2019 at 02:58:36PM +0200, Stefano Garzarella wrote:
> > +struct virtio_vsock_buf {
> 
> Please add a comment describing the purpose of this struct and to
> differentiate its use from struct virtio_vsock_pkt.
> 

Sure, I'll fix it.

> > +static struct virtio_vsock_buf *
> > +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> > +{
> > +	struct virtio_vsock_buf *buf;
> > +
> > +	if (pkt->len == 0)
> > +		return NULL;
> > +
> > +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> > +	if (!buf)
> > +		return NULL;
> > +
> > +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
> > +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
> > +	 * we are not use more memory than that counted by the credit mechanism.
> > +	 */
> > +	if (zero_copy && pkt->len == pkt->buf_len) {
> > +		buf->addr = pkt->buf;
> > +		pkt->buf = NULL;
> > +	} else {
> > +		buf->addr = kmalloc(pkt->len, GFP_KERNEL);
> 
> buf and buf->addr could be allocated in a single call, though I'm not
> sure how big an optimization this is.
> 

IIUC, in the zero-copy case I should allocate only the buf; otherwise,
when doing a full copy, I should allocate both buf and buf->addr in a
single call.

Is that correct?

> > @@ -841,20 +882,24 @@ virtio_transport_recv_connected(struct sock *sk,
> >  {
> >  	struct vsock_sock *vsk = vsock_sk(sk);
> >  	struct virtio_vsock_sock *vvs = vsk->trans;
> > +	struct virtio_vsock_buf *buf;
> >  	int err = 0;
> >  
> >  	switch (le16_to_cpu(pkt->hdr.op)) {
> >  	case VIRTIO_VSOCK_OP_RW:
> >  		pkt->len = le32_to_cpu(pkt->hdr.len);
> > -		pkt->off = 0;
> > +		buf = virtio_transport_alloc_buf(pkt, true);
> >  
> > -		spin_lock_bh(&vvs->rx_lock);
> > -		virtio_transport_inc_rx_pkt(vvs, pkt);
> > -		list_add_tail(&pkt->list, &vvs->rx_queue);
> > -		spin_unlock_bh(&vvs->rx_lock);
> > +		if (buf) {
> > +			spin_lock_bh(&vvs->rx_lock);
> > +			virtio_transport_inc_rx_pkt(vvs, pkt->len);
> > +			list_add_tail(&buf->list, &vvs->rx_queue);
> > +			spin_unlock_bh(&vvs->rx_lock);
> >  
> > -		sk->sk_data_ready(sk);
> > -		return err;
> > +			sk->sk_data_ready(sk);
> > +		}
> 
> The return value of this function isn't used but the code still makes an
> effort to return errors.  Please return -ENOMEM when buf == NULL.
> 
> If you'd like to remove the return value that's fine too, but please do
> it for the whole function to be consistent.

I'll return -ENOMEM when the allocation fails.

Thanks,
Stefano
Stefan Hajnoczi May 20, 2019, 8:57 a.m. UTC | #11
On Fri, May 17, 2019 at 10:25:05AM +0200, Stefano Garzarella wrote:
> On Thu, May 16, 2019 at 04:25:33PM +0100, Stefan Hajnoczi wrote:
> > On Fri, May 10, 2019 at 02:58:36PM +0200, Stefano Garzarella wrote:
> > > +static struct virtio_vsock_buf *
> > > +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> > > +{
> > > +	struct virtio_vsock_buf *buf;
> > > +
> > > +	if (pkt->len == 0)
> > > +		return NULL;
> > > +
> > > +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> > > +	if (!buf)
> > > +		return NULL;
> > > +
> > > +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
> > > +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
> > > +	 * we are not use more memory than that counted by the credit mechanism.
> > > +	 */
> > > +	if (zero_copy && pkt->len == pkt->buf_len) {
> > > +		buf->addr = pkt->buf;
> > > +		pkt->buf = NULL;
> > > +	} else {
> > > +		buf->addr = kmalloc(pkt->len, GFP_KERNEL);
> > 
> > buf and buf->addr could be allocated in a single call, though I'm not
> > sure how big an optimization this is.
> > 
> 
> IIUC, in the case of zero-copy I should allocate only the buf,
> otherwise I should allocate both buf and buf->addr in a single call
> when I'm doing a full-copy.
> 
> Is it correct?

Yes, but it's your choice whether the optimization is worthwhile.  If it
increases the complexity of the code and doesn't result in a measurable
improvement, then it's not worth it.

Stefan
Stefano Garzarella May 28, 2019, 4:45 p.m. UTC | #12
On Wed, May 15, 2019 at 10:48:44AM +0800, Jason Wang wrote:
> 
> On 2019/5/15 上午12:35, Stefano Garzarella wrote:
> > On Tue, May 14, 2019 at 11:25:34AM +0800, Jason Wang wrote:
> > > On 2019/5/14 上午1:23, Stefano Garzarella wrote:
> > > > On Mon, May 13, 2019 at 05:58:53PM +0800, Jason Wang wrote:
> > > > > On 2019/5/10 下午8:58, Stefano Garzarella wrote:
> > > > > > +static struct virtio_vsock_buf *
> > > > > > +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> > > > > > +{
> > > > > > +	struct virtio_vsock_buf *buf;
> > > > > > +
> > > > > > +	if (pkt->len == 0)
> > > > > > +		return NULL;
> > > > > > +
> > > > > > +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> > > > > > +	if (!buf)
> > > > > > +		return NULL;
> > > > > > +
> > > > > > +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
> > > > > > +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
> > > > > > +	 * we are not use more memory than that counted by the credit mechanism.
> > > > > > +	 */
> > > > > > +	if (zero_copy && pkt->len == pkt->buf_len) {
> > > > > > +		buf->addr = pkt->buf;
> > > > > > +		pkt->buf = NULL;
> > > > > > +	} else {
> > > > > Is the copy still needed if we're just few bytes less? We meet similar issue
> > > > > for virito-net, and virtio-net solve this by always copy first 128bytes for
> > > > > big packets.
> > > > > 
> > > > > See receive_big()
> > > > I'm seeing, It is more sophisticated.
> > > > IIUC, virtio-net allocates a sk_buff with 128 bytes of buffer, then copies the
> > > > first 128 bytes, then adds the buffer used to receive the packet as a frag to
> > > > the skb.
> > > 
> > > Yes and the point is if the packet is smaller than 128 bytes the pages will
> > > be recycled.
> > > 
> > > 
> > So it's avoid the overhead of allocation of a large buffer. I got it.
> > 
> > Just a curiosity, why the threshold is 128 bytes?
> 
> 
> From its name (GOOD_COPY_LEN), I think it just a value that won't lose much
> performance, e.g the size two cachelines.
> 

Jason, Stefan,
since I'm removing the patches that increase the buffers to 64 KiB and I'm
adding a threshold for small packets, I would simplify this patch by
removing the new buffer allocation and copying small packets into the
buffers already queued (if there is space).
This way, I should solve the issue of 1-byte packets.

Do you think this would be better?

Thanks,
Stefano
Jason Wang May 29, 2019, 12:59 a.m. UTC | #13
On 2019/5/29 上午12:45, Stefano Garzarella wrote:
> On Wed, May 15, 2019 at 10:48:44AM +0800, Jason Wang wrote:
>> On 2019/5/15 上午12:35, Stefano Garzarella wrote:
>>> On Tue, May 14, 2019 at 11:25:34AM +0800, Jason Wang wrote:
>>>> On 2019/5/14 上午1:23, Stefano Garzarella wrote:
>>>>> On Mon, May 13, 2019 at 05:58:53PM +0800, Jason Wang wrote:
>>>>>> On 2019/5/10 下午8:58, Stefano Garzarella wrote:
>>>>>>> +static struct virtio_vsock_buf *
>>>>>>> +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
>>>>>>> +{
>>>>>>> +	struct virtio_vsock_buf *buf;
>>>>>>> +
>>>>>>> +	if (pkt->len == 0)
>>>>>>> +		return NULL;
>>>>>>> +
>>>>>>> +	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
>>>>>>> +	if (!buf)
>>>>>>> +		return NULL;
>>>>>>> +
>>>>>>> +	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
>>>>>>> +	 * the new virtio_vsock_buf avoiding the copy, because we are sure that
>>>>>>> +	 * we are not use more memory than that counted by the credit mechanism.
>>>>>>> +	 */
>>>>>>> +	if (zero_copy && pkt->len == pkt->buf_len) {
>>>>>>> +		buf->addr = pkt->buf;
>>>>>>> +		pkt->buf = NULL;
>>>>>>> +	} else {
>>>>>> Is the copy still needed if we're just few bytes less? We meet similar issue
>>>>>> for virito-net, and virtio-net solve this by always copy first 128bytes for
>>>>>> big packets.
>>>>>>
>>>>>> See receive_big()
>>>>> I'm seeing, It is more sophisticated.
>>>>> IIUC, virtio-net allocates a sk_buff with 128 bytes of buffer, then copies the
>>>>> first 128 bytes, then adds the buffer used to receive the packet as a frag to
>>>>> the skb.
>>>> Yes and the point is if the packet is smaller than 128 bytes the pages will
>>>> be recycled.
>>>>
>>>>
>>> So it's avoid the overhead of allocation of a large buffer. I got it.
>>>
>>> Just a curiosity, why the threshold is 128 bytes?
>>
>>  From its name (GOOD_COPY_LEN), I think it just a value that won't lose much
>> performance, e.g the size two cachelines.
>>
> Jason, Stefan,
> since I'm removing the patches to increase the buffers to 64 KiB and I'm
> adding a threshold for small packets, I would simplify this patch,
> removing the new buffer allocation and copying small packets into the
> buffers already queued (if there is a space).
> In this way, I should solve the issue of 1 byte packets.
>
> Do you think could be better?


I think so.

Thanks


>
> Thanks,
> Stefano
diff mbox series

Patch

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index bb5fc0e9fbc2..7964e2daee09 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -320,6 +320,8 @@  vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
 		return NULL;
 	}
 
+	pkt->buf_len = pkt->len;
+
 	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
 	if (nbytes != pkt->len) {
 		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index e223e2632edd..345f04ee9193 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -54,9 +54,17 @@  struct virtio_vsock_pkt {
 	void *buf;
 	u32 len;
 	u32 off;
+	u32 buf_len;
 	bool reply;
 };
 
+struct virtio_vsock_buf {
+	struct list_head list;
+	void *addr;
+	u32 len;
+	u32 off;
+};
+
 struct virtio_vsock_pkt_info {
 	u32 remote_cid, remote_port;
 	struct vsock_sock *vsk;
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 15eb5d3d4750..af1d2ce12f54 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -280,6 +280,7 @@  static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
 			break;
 		}
 
+		pkt->buf_len = buf_len;
 		pkt->len = buf_len;
 
 		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 602715fc9a75..0248d6808755 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -65,6 +65,9 @@  virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
 		pkt->buf = kmalloc(len, GFP_KERNEL);
 		if (!pkt->buf)
 			goto out_pkt;
+
+		pkt->buf_len = len;
+
 		err = memcpy_from_msg(pkt->buf, info->msg, len);
 		if (err)
 			goto out;
@@ -86,6 +89,46 @@  virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
 	return NULL;
 }
 
+static struct virtio_vsock_buf *
+virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
+{
+	struct virtio_vsock_buf *buf;
+
+	if (pkt->len == 0)
+		return NULL;
+
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	/* If the buffer in the virtio_vsock_pkt is full, we can move it to
+	 * the new virtio_vsock_buf, avoiding the copy, because we are sure that
+	 * we do not use more memory than that counted by the credit mechanism.
+	 */
+	if (zero_copy && pkt->len == pkt->buf_len) {
+		buf->addr = pkt->buf;
+		pkt->buf = NULL;
+	} else {
+		buf->addr = kmalloc(pkt->len, GFP_KERNEL);
+		if (!buf->addr) {
+			kfree(buf);
+			return NULL;
+		}
+
+		memcpy(buf->addr, pkt->buf, pkt->len);
+	}
+
+	buf->len = pkt->len;
+
+	return buf;
+}
+
+static void virtio_transport_free_buf(struct virtio_vsock_buf *buf)
+{
+	kfree(buf->addr);
+	kfree(buf);
+}
+
 /* Packet capture */
 static struct sk_buff *virtio_transport_build_skb(void *opaque)
 {
@@ -190,17 +233,15 @@  static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	return virtio_transport_get_ops()->send_pkt(pkt);
 }
 
-static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
-					struct virtio_vsock_pkt *pkt)
+static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len)
 {
-	vvs->rx_bytes += pkt->len;
+	vvs->rx_bytes += len;
 }
 
-static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
-					struct virtio_vsock_pkt *pkt)
+static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, u32 len)
 {
-	vvs->rx_bytes -= pkt->len;
-	vvs->fwd_cnt += pkt->len;
+	vvs->rx_bytes -= len;
+	vvs->fwd_cnt += len;
 }
 
 void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
@@ -254,36 +295,36 @@  virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 				   size_t len)
 {
 	struct virtio_vsock_sock *vvs = vsk->trans;
-	struct virtio_vsock_pkt *pkt;
+	struct virtio_vsock_buf *buf;
 	size_t bytes, total = 0;
 	int err = -EFAULT;
 
 	spin_lock_bh(&vvs->rx_lock);
 	while (total < len && !list_empty(&vvs->rx_queue)) {
-		pkt = list_first_entry(&vvs->rx_queue,
-				       struct virtio_vsock_pkt, list);
+		buf = list_first_entry(&vvs->rx_queue,
+				       struct virtio_vsock_buf, list);
 
 		bytes = len - total;
-		if (bytes > pkt->len - pkt->off)
-			bytes = pkt->len - pkt->off;
+		if (bytes > buf->len - buf->off)
+			bytes = buf->len - buf->off;
 
 		/* sk_lock is held by caller so no one else can dequeue.
 		 * Unlock rx_lock since memcpy_to_msg() may sleep.
 		 */
 		spin_unlock_bh(&vvs->rx_lock);
 
-		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
+		err = memcpy_to_msg(msg, buf->addr + buf->off, bytes);
 		if (err)
 			goto out;
 
 		spin_lock_bh(&vvs->rx_lock);
 
 		total += bytes;
-		pkt->off += bytes;
-		if (pkt->off == pkt->len) {
-			virtio_transport_dec_rx_pkt(vvs, pkt);
-			list_del(&pkt->list);
-			virtio_transport_free_pkt(pkt);
+		buf->off += bytes;
+		if (buf->off == buf->len) {
+			virtio_transport_dec_rx_pkt(vvs, buf->len);
+			list_del(&buf->list);
+			virtio_transport_free_buf(buf);
 		}
 	}
 	spin_unlock_bh(&vvs->rx_lock);
@@ -841,20 +882,24 @@  virtio_transport_recv_connected(struct sock *sk,
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
 	struct virtio_vsock_sock *vvs = vsk->trans;
+	struct virtio_vsock_buf *buf;
 	int err = 0;
 
 	switch (le16_to_cpu(pkt->hdr.op)) {
 	case VIRTIO_VSOCK_OP_RW:
 		pkt->len = le32_to_cpu(pkt->hdr.len);
-		pkt->off = 0;
+		buf = virtio_transport_alloc_buf(pkt, true);
 
-		spin_lock_bh(&vvs->rx_lock);
-		virtio_transport_inc_rx_pkt(vvs, pkt);
-		list_add_tail(&pkt->list, &vvs->rx_queue);
-		spin_unlock_bh(&vvs->rx_lock);
+		if (buf) {
+			spin_lock_bh(&vvs->rx_lock);
+			virtio_transport_inc_rx_pkt(vvs, pkt->len);
+			list_add_tail(&buf->list, &vvs->rx_queue);
+			spin_unlock_bh(&vvs->rx_lock);
 
-		sk->sk_data_ready(sk);
-		return err;
+			sk->sk_data_ready(sk);
+		}
+
+		break;
 	case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
 		sk->sk_write_space(sk);
 		break;