[v4,1/5] vsock/virtio: limit the memory used per-socket
diff mbox series

Message ID 20190717113030.163499-2-sgarzare@redhat.com
State New
Headers show
Series
  • vsock/virtio: optimizations to increase the throughput
Related show

Commit Message

Stefano Garzarella July 17, 2019, 11:30 a.m. UTC
Since virtio-vsock was introduced, the buffers filled by the host
and pushed to the guest using the vring, are directly queued in
a per-socket list. These buffers are preallocated by the guest
with a fixed size (4 KB).

The maximum amount of memory used by each socket should be
controlled by the credit mechanism.
The default credit available per-socket is 256 KB, but if we use
only 1 byte per packet, the guest can queue up to 262144 of 4 KB
buffers, using up to 1 GB of memory per-socket. In addition, the
guest will continue to fill the vring with new 4 KB free buffers
to avoid starvation of other sockets.

This patch mitigates this issue copying the payload of small
packets (< 128 bytes) into the buffer of last packet queued, in
order to avoid wasting memory.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
 drivers/vhost/vsock.c                   |  2 +
 include/linux/virtio_vsock.h            |  1 +
 net/vmw_vsock/virtio_transport.c        |  1 +
 net/vmw_vsock/virtio_transport_common.c | 60 +++++++++++++++++++++----
 4 files changed, 55 insertions(+), 9 deletions(-)

Comments

Michael S. Tsirkin July 29, 2019, 2:04 p.m. UTC | #1
On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> Since virtio-vsock was introduced, the buffers filled by the host
> and pushed to the guest using the vring, are directly queued in
> a per-socket list. These buffers are preallocated by the guest
> with a fixed size (4 KB).
> 
> The maximum amount of memory used by each socket should be
> controlled by the credit mechanism.
> The default credit available per-socket is 256 KB, but if we use
> only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> buffers, using up to 1 GB of memory per-socket. In addition, the
> guest will continue to fill the vring with new 4 KB free buffers
> to avoid starvation of other sockets.
> 
> This patch mitigates this issue copying the payload of small
> packets (< 128 bytes) into the buffer of last packet queued, in
> order to avoid wasting memory.
> 
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>

This is good enough for net-next, but for net I think we
should figure out how to address the issue completely.
Can we make the accounting precise? What happens to
performance if we do?


> ---
>  drivers/vhost/vsock.c                   |  2 +
>  include/linux/virtio_vsock.h            |  1 +
>  net/vmw_vsock/virtio_transport.c        |  1 +
>  net/vmw_vsock/virtio_transport_common.c | 60 +++++++++++++++++++++----
>  4 files changed, 55 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index 6a50e1d0529c..6c8390a2af52 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -329,6 +329,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
>  		return NULL;
>  	}
>  
> +	pkt->buf_len = pkt->len;
> +
>  	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>  	if (nbytes != pkt->len) {
>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> index e223e2632edd..7d973903f52e 100644
> --- a/include/linux/virtio_vsock.h
> +++ b/include/linux/virtio_vsock.h
> @@ -52,6 +52,7 @@ struct virtio_vsock_pkt {
>  	/* socket refcnt not held, only use for cancellation */
>  	struct vsock_sock *vsk;
>  	void *buf;
> +	u32 buf_len;
>  	u32 len;
>  	u32 off;
>  	bool reply;
> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> index 0815d1357861..082a30936690 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -307,6 +307,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
>  			break;
>  		}
>  
> +		pkt->buf_len = buf_len;
>  		pkt->len = buf_len;
>  
>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 6f1a8aff65c5..095221f94786 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -26,6 +26,9 @@
>  /* How long to wait for graceful shutdown of a connection */
>  #define VSOCK_CLOSE_TIMEOUT (8 * HZ)
>  
> +/* Threshold for detecting small packets to copy */
> +#define GOOD_COPY_LEN  128
> +
>  static const struct virtio_transport *virtio_transport_get_ops(void)
>  {
>  	const struct vsock_transport *t = vsock_core_get_transport();
> @@ -64,6 +67,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
>  		pkt->buf = kmalloc(len, GFP_KERNEL);
>  		if (!pkt->buf)
>  			goto out_pkt;
> +
> +		pkt->buf_len = len;
> +
>  		err = memcpy_from_msg(pkt->buf, info->msg, len);
>  		if (err)
>  			goto out;
> @@ -841,24 +847,60 @@ virtio_transport_recv_connecting(struct sock *sk,
>  	return err;
>  }
>  
> +static void
> +virtio_transport_recv_enqueue(struct vsock_sock *vsk,
> +			      struct virtio_vsock_pkt *pkt)
> +{
> +	struct virtio_vsock_sock *vvs = vsk->trans;
> +	bool free_pkt = false;
> +
> +	pkt->len = le32_to_cpu(pkt->hdr.len);
> +	pkt->off = 0;
> +
> +	spin_lock_bh(&vvs->rx_lock);
> +
> +	virtio_transport_inc_rx_pkt(vvs, pkt);
> +
> +	/* Try to copy small packets into the buffer of last packet queued,
> +	 * to avoid wasting memory queueing the entire buffer with a small
> +	 * payload.
> +	 */
> +	if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) {
> +		struct virtio_vsock_pkt *last_pkt;
> +
> +		last_pkt = list_last_entry(&vvs->rx_queue,
> +					   struct virtio_vsock_pkt, list);
> +
> +		/* If there is space in the last packet queued, we copy the
> +		 * new packet in its buffer.
> +		 */
> +		if (pkt->len <= last_pkt->buf_len - last_pkt->len) {
> +			memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
> +			       pkt->len);
> +			last_pkt->len += pkt->len;
> +			free_pkt = true;
> +			goto out;
> +		}
> +	}
> +
> +	list_add_tail(&pkt->list, &vvs->rx_queue);
> +
> +out:
> +	spin_unlock_bh(&vvs->rx_lock);
> +	if (free_pkt)
> +		virtio_transport_free_pkt(pkt);
> +}
> +
>  static int
>  virtio_transport_recv_connected(struct sock *sk,
>  				struct virtio_vsock_pkt *pkt)
>  {
>  	struct vsock_sock *vsk = vsock_sk(sk);
> -	struct virtio_vsock_sock *vvs = vsk->trans;
>  	int err = 0;
>  
>  	switch (le16_to_cpu(pkt->hdr.op)) {
>  	case VIRTIO_VSOCK_OP_RW:
> -		pkt->len = le32_to_cpu(pkt->hdr.len);
> -		pkt->off = 0;
> -
> -		spin_lock_bh(&vvs->rx_lock);
> -		virtio_transport_inc_rx_pkt(vvs, pkt);
> -		list_add_tail(&pkt->list, &vvs->rx_queue);
> -		spin_unlock_bh(&vvs->rx_lock);
> -
> +		virtio_transport_recv_enqueue(vsk, pkt);
>  		sk->sk_data_ready(sk);
>  		return err;
>  	case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
> -- 
> 2.20.1
Stefano Garzarella July 29, 2019, 3:36 p.m. UTC | #2
On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > Since virtio-vsock was introduced, the buffers filled by the host
> > and pushed to the guest using the vring, are directly queued in
> > a per-socket list. These buffers are preallocated by the guest
> > with a fixed size (4 KB).
> > 
> > The maximum amount of memory used by each socket should be
> > controlled by the credit mechanism.
> > The default credit available per-socket is 256 KB, but if we use
> > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > buffers, using up to 1 GB of memory per-socket. In addition, the
> > guest will continue to fill the vring with new 4 KB free buffers
> > to avoid starvation of other sockets.
> > 
> > This patch mitigates this issue copying the payload of small
> > packets (< 128 bytes) into the buffer of last packet queued, in
> > order to avoid wasting memory.
> > 
> > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> 
> This is good enough for net-next, but for net I think we
> should figure out how to address the issue completely.
> Can we make the accounting precise? What happens to
> performance if we do?
> 

In order to do more precise accounting maybe we can use the buffer size,
instead of payload size when we update the credit available.
In this way, the credit available for each socket will reflect the memory
actually used.

I should check better, because I'm not sure what happen if the peer sees
1KB of space available, then it sends 1KB of payload (using a 4KB
buffer).

The other option is to copy each packet in a new buffer like I did in
the v2 [2], but this forces us to make a copy for each packet that does
not fill the entire buffer, perhaps too expensive.

[2] https://patchwork.kernel.org/patch/10938741/


Thanks,
Stefano
Michael S. Tsirkin July 29, 2019, 3:49 p.m. UTC | #3
On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > Since virtio-vsock was introduced, the buffers filled by the host
> > > and pushed to the guest using the vring, are directly queued in
> > > a per-socket list. These buffers are preallocated by the guest
> > > with a fixed size (4 KB).
> > > 
> > > The maximum amount of memory used by each socket should be
> > > controlled by the credit mechanism.
> > > The default credit available per-socket is 256 KB, but if we use
> > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > guest will continue to fill the vring with new 4 KB free buffers
> > > to avoid starvation of other sockets.
> > > 
> > > This patch mitigates this issue copying the payload of small
> > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > order to avoid wasting memory.
> > > 
> > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > 
> > This is good enough for net-next, but for net I think we
> > should figure out how to address the issue completely.
> > Can we make the accounting precise? What happens to
> > performance if we do?
> > 
> 
> In order to do more precise accounting maybe we can use the buffer size,
> instead of payload size when we update the credit available.
> In this way, the credit available for each socket will reflect the memory
> actually used.
> 
> I should check better, because I'm not sure what happen if the peer sees
> 1KB of space available, then it sends 1KB of payload (using a 4KB
> buffer).
> 
> The other option is to copy each packet in a new buffer like I did in
> the v2 [2], but this forces us to make a copy for each packet that does
> not fill the entire buffer, perhaps too expensive.
> 
> [2] https://patchwork.kernel.org/patch/10938741/
> 
> 
> Thanks,
> Stefano

Interesting. You are right, and at some level the protocol forces copies.

We could try to detect that the actual memory is getting close to
admin limits and force copies on queued packets after the fact.
Is that practical?

And yes we can extend the credit accounting to include buffer size.
That's a protocol change but maybe it makes sense.
Michael S. Tsirkin July 29, 2019, 4:01 p.m. UTC | #4
On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > Since virtio-vsock was introduced, the buffers filled by the host
> > > and pushed to the guest using the vring, are directly queued in
> > > a per-socket list. These buffers are preallocated by the guest
> > > with a fixed size (4 KB).
> > > 
> > > The maximum amount of memory used by each socket should be
> > > controlled by the credit mechanism.
> > > The default credit available per-socket is 256 KB, but if we use
> > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > guest will continue to fill the vring with new 4 KB free buffers
> > > to avoid starvation of other sockets.
> > > 
> > > This patch mitigates this issue copying the payload of small
> > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > order to avoid wasting memory.
> > > 
> > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > 
> > This is good enough for net-next, but for net I think we
> > should figure out how to address the issue completely.
> > Can we make the accounting precise? What happens to
> > performance if we do?
> > 
> 
> In order to do more precise accounting maybe we can use the buffer size,
> instead of payload size when we update the credit available.
> In this way, the credit available for each socket will reflect the memory
> actually used.
> 
> I should check better, because I'm not sure what happen if the peer sees
> 1KB of space available, then it sends 1KB of payload (using a 4KB
> buffer).
> The other option is to copy each packet in a new buffer like I did in
> the v2 [2], but this forces us to make a copy for each packet that does
> not fill the entire buffer, perhaps too expensive.
> 
> [2] https://patchwork.kernel.org/patch/10938741/
> 

So one thing we can easily do is to under-report the
available credit. E.g. if we copy up to 256bytes,
then report just 256bytes for every buffer in the queue.


> 
> Thanks,
> Stefano
Stefano Garzarella July 29, 2019, 4:19 p.m. UTC | #5
On Mon, Jul 29, 2019 at 11:49:02AM -0400, Michael S. Tsirkin wrote:
> On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > and pushed to the guest using the vring, are directly queued in
> > > > a per-socket list. These buffers are preallocated by the guest
> > > > with a fixed size (4 KB).
> > > > 
> > > > The maximum amount of memory used by each socket should be
> > > > controlled by the credit mechanism.
> > > > The default credit available per-socket is 256 KB, but if we use
> > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > to avoid starvation of other sockets.
> > > > 
> > > > This patch mitigates this issue copying the payload of small
> > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > order to avoid wasting memory.
> > > > 
> > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > 
> > > This is good enough for net-next, but for net I think we
> > > should figure out how to address the issue completely.
> > > Can we make the accounting precise? What happens to
> > > performance if we do?
> > > 
> > 
> > In order to do more precise accounting maybe we can use the buffer size,
> > instead of payload size when we update the credit available.
> > In this way, the credit available for each socket will reflect the memory
> > actually used.
> > 
> > I should check better, because I'm not sure what happen if the peer sees
> > 1KB of space available, then it sends 1KB of payload (using a 4KB
> > buffer).
> > 
> > The other option is to copy each packet in a new buffer like I did in
> > the v2 [2], but this forces us to make a copy for each packet that does
> > not fill the entire buffer, perhaps too expensive.
> > 
> > [2] https://patchwork.kernel.org/patch/10938741/
> > 
> > 
> > Thanks,
> > Stefano
> 
> Interesting. You are right, and at some level the protocol forces copies.
> 
> We could try to detect that the actual memory is getting close to
> admin limits and force copies on queued packets after the fact.
> Is that practical?

Yes, I think it is doable!
We can decrease the credit available with the buffer size queued, and
when the buffer size of packet to queue is bigger than the credit
available, we can copy it.

> 
> And yes we can extend the credit accounting to include buffer size.
> That's a protocol change but maybe it makes sense.

Since we send to the other peer the credit available, maybe this
change can be backwards compatible (I'll check better this).

Thanks,
Stefano
Stefano Garzarella July 29, 2019, 4:41 p.m. UTC | #6
On Mon, Jul 29, 2019 at 12:01:37PM -0400, Michael S. Tsirkin wrote:
> On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > and pushed to the guest using the vring, are directly queued in
> > > > a per-socket list. These buffers are preallocated by the guest
> > > > with a fixed size (4 KB).
> > > >
> > > > The maximum amount of memory used by each socket should be
> > > > controlled by the credit mechanism.
> > > > The default credit available per-socket is 256 KB, but if we use
> > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > to avoid starvation of other sockets.
> > > >
> > > > This patch mitigates this issue copying the payload of small
> > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > order to avoid wasting memory.
> > > >
> > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > >
> > > This is good enough for net-next, but for net I think we
> > > should figure out how to address the issue completely.
> > > Can we make the accounting precise? What happens to
> > > performance if we do?
> > >
> >
> > In order to do more precise accounting maybe we can use the buffer size,
> > instead of payload size when we update the credit available.
> > In this way, the credit available for each socket will reflect the memory
> > actually used.
> >
> > I should check better, because I'm not sure what happen if the peer sees
> > 1KB of space available, then it sends 1KB of payload (using a 4KB
> > buffer).
> > The other option is to copy each packet in a new buffer like I did in
> > the v2 [2], but this forces us to make a copy for each packet that does
> > not fill the entire buffer, perhaps too expensive.
> >
> > [2] https://patchwork.kernel.org/patch/10938741/
> >
>
> So one thing we can easily do is to under-report the
> available credit. E.g. if we copy up to 256bytes,
> then report just 256bytes for every buffer in the queue.
>

Ehm sorry, I got lost :(
Can you explain better?


Thanks,
Stefano
Stefano Garzarella July 29, 2019, 4:50 p.m. UTC | #7
On Mon, Jul 29, 2019 at 06:19:03PM +0200, Stefano Garzarella wrote:
> On Mon, Jul 29, 2019 at 11:49:02AM -0400, Michael S. Tsirkin wrote:
> > On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > and pushed to the guest using the vring, are directly queued in
> > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > with a fixed size (4 KB).
> > > > > 
> > > > > The maximum amount of memory used by each socket should be
> > > > > controlled by the credit mechanism.
> > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > to avoid starvation of other sockets.
> > > > > 
> > > > > This patch mitigates this issue copying the payload of small
> > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > order to avoid wasting memory.
> > > > > 
> > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > 
> > > > This is good enough for net-next, but for net I think we
> > > > should figure out how to address the issue completely.
> > > > Can we make the accounting precise? What happens to
> > > > performance if we do?
> > > > 
> > > 
> > > In order to do more precise accounting maybe we can use the buffer size,
> > > instead of payload size when we update the credit available.
> > > In this way, the credit available for each socket will reflect the memory
> > > actually used.
> > > 
> > > I should check better, because I'm not sure what happen if the peer sees
> > > 1KB of space available, then it sends 1KB of payload (using a 4KB
> > > buffer).
> > > 
> > > The other option is to copy each packet in a new buffer like I did in
> > > the v2 [2], but this forces us to make a copy for each packet that does
> > > not fill the entire buffer, perhaps too expensive.
> > > 
> > > [2] https://patchwork.kernel.org/patch/10938741/
> > > 
> > > 
> > > Thanks,
> > > Stefano
> > 
> > Interesting. You are right, and at some level the protocol forces copies.
> > 
> > We could try to detect that the actual memory is getting close to
> > admin limits and force copies on queued packets after the fact.
> > Is that practical?
> 
> Yes, I think it is doable!
> We can decrease the credit available with the buffer size queued, and
> when the buffer size of packet to queue is bigger than the credit
> available, we can copy it.
> 
> > 
> > And yes we can extend the credit accounting to include buffer size.
> > That's a protocol change but maybe it makes sense.
> 
> Since we send to the other peer the credit available, maybe this
> change can be backwards compatible (I'll check better this).

What I said was wrong.

We send a counter (increased when the user consumes the packets) and the
"buf_alloc" (the max memory allowed) to the other peer.
It makes a difference between a local counter (increased when the
packets are sent) and the remote counter to calculate the credit available:

    u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
    {
    	u32 ret;

    	spin_lock_bh(&vvs->tx_lock);
    	ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
    	if (ret > credit)
    		ret = credit;
    	vvs->tx_cnt += ret;
    	spin_unlock_bh(&vvs->tx_lock);

    	return ret;
    }

Maybe I can play with "buf_alloc" to take care of bytes queued but not
used.

Thanks,
Stefano
Michael S. Tsirkin July 29, 2019, 7:10 p.m. UTC | #8
On Mon, Jul 29, 2019 at 06:50:56PM +0200, Stefano Garzarella wrote:
> On Mon, Jul 29, 2019 at 06:19:03PM +0200, Stefano Garzarella wrote:
> > On Mon, Jul 29, 2019 at 11:49:02AM -0400, Michael S. Tsirkin wrote:
> > > On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> > > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > > and pushed to the guest using the vring, are directly queued in
> > > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > > with a fixed size (4 KB).
> > > > > > 
> > > > > > The maximum amount of memory used by each socket should be
> > > > > > controlled by the credit mechanism.
> > > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > > to avoid starvation of other sockets.
> > > > > > 
> > > > > > This patch mitigates this issue copying the payload of small
> > > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > > order to avoid wasting memory.
> > > > > > 
> > > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > > 
> > > > > This is good enough for net-next, but for net I think we
> > > > > should figure out how to address the issue completely.
> > > > > Can we make the accounting precise? What happens to
> > > > > performance if we do?
> > > > > 
> > > > 
> > > > In order to do more precise accounting maybe we can use the buffer size,
> > > > instead of payload size when we update the credit available.
> > > > In this way, the credit available for each socket will reflect the memory
> > > > actually used.
> > > > 
> > > > I should check better, because I'm not sure what happen if the peer sees
> > > > 1KB of space available, then it sends 1KB of payload (using a 4KB
> > > > buffer).
> > > > 
> > > > The other option is to copy each packet in a new buffer like I did in
> > > > the v2 [2], but this forces us to make a copy for each packet that does
> > > > not fill the entire buffer, perhaps too expensive.
> > > > 
> > > > [2] https://patchwork.kernel.org/patch/10938741/
> > > > 
> > > > 
> > > > Thanks,
> > > > Stefano
> > > 
> > > Interesting. You are right, and at some level the protocol forces copies.
> > > 
> > > We could try to detect that the actual memory is getting close to
> > > admin limits and force copies on queued packets after the fact.
> > > Is that practical?
> > 
> > Yes, I think it is doable!
> > We can decrease the credit available with the buffer size queued, and
> > when the buffer size of packet to queue is bigger than the credit
> > available, we can copy it.
> > 
> > > 
> > > And yes we can extend the credit accounting to include buffer size.
> > > That's a protocol change but maybe it makes sense.
> > 
> > Since we send to the other peer the credit available, maybe this
> > change can be backwards compatible (I'll check better this).
> 
> What I said was wrong.
> 
> We send a counter (increased when the user consumes the packets) and the
> "buf_alloc" (the max memory allowed) to the other peer.
> It makes a difference between a local counter (increased when the
> packets are sent) and the remote counter to calculate the credit available:
> 
>     u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
>     {
>     	u32 ret;
> 
>     	spin_lock_bh(&vvs->tx_lock);
>     	ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
>     	if (ret > credit)
>     		ret = credit;
>     	vvs->tx_cnt += ret;
>     	spin_unlock_bh(&vvs->tx_lock);
> 
>     	return ret;
>     }
> 
> Maybe I can play with "buf_alloc" to take care of bytes queued but not
> used.
> 
> Thanks,
> Stefano

Right. And the idea behind it all was that if we send a credit
to remote then we have space for it.
I think the basic idea was that if we have actual allocated
memory and can copy data there, then we send the credit to
remote.

Of course that means an extra copy every packet.
So as an optimization, it seems that we just assume
that we will be able to allocate a new buffer.

First this is not the best we can do. We can actually do
allocate memory in the socket before sending credit.
If packet is small then we copy it there.
If packet is big then we queue the packet,
take the buffer out of socket and add it to the virtqueue.

Second question is what to do about medium sized packets.
Packet is 1K but buffer is 4K, what do we do?
And here I wonder - why don't we add the 3K buffer
to the vq?
Michael S. Tsirkin July 29, 2019, 7:33 p.m. UTC | #9
On Mon, Jul 29, 2019 at 06:41:27PM +0200, Stefano Garzarella wrote:
> On Mon, Jul 29, 2019 at 12:01:37PM -0400, Michael S. Tsirkin wrote:
> > On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > and pushed to the guest using the vring, are directly queued in
> > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > with a fixed size (4 KB).
> > > > >
> > > > > The maximum amount of memory used by each socket should be
> > > > > controlled by the credit mechanism.
> > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > to avoid starvation of other sockets.
> > > > >
> > > > > This patch mitigates this issue copying the payload of small
> > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > order to avoid wasting memory.
> > > > >
> > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > >
> > > > This is good enough for net-next, but for net I think we
> > > > should figure out how to address the issue completely.
> > > > Can we make the accounting precise? What happens to
> > > > performance if we do?
> > > >
> > >
> > > In order to do more precise accounting maybe we can use the buffer size,
> > > instead of payload size when we update the credit available.
> > > In this way, the credit available for each socket will reflect the memory
> > > actually used.
> > >
> > > I should check better, because I'm not sure what happen if the peer sees
> > > 1KB of space available, then it sends 1KB of payload (using a 4KB
> > > buffer).
> > > The other option is to copy each packet in a new buffer like I did in
> > > the v2 [2], but this forces us to make a copy for each packet that does
> > > not fill the entire buffer, perhaps too expensive.
> > >
> > > [2] https://patchwork.kernel.org/patch/10938741/
> > >
> >
> > So one thing we can easily do is to under-report the
> > available credit. E.g. if we copy up to 256bytes,
> > then report just 256bytes for every buffer in the queue.
> >
> 
> Ehm sorry, I got lost :(
> Can you explain better?
> 
> 
> Thanks,
> Stefano

I think I suggested a better idea more recently.
But to clarify this option: we are adding a 4K buffer.
Let's say we know we will always copy 128 bytes.

So we just tell remote we have 128.
If we add another 4K buffer we add another 128 credits.

So we are charging local socket 16x more (4k for a 128 byte packet) but
we are paying remote 16x less (128 credits for 4k byte buffer). It evens
out.

Way less credits to go around so I'm not sure it's a good idea,
at least as the only solution. Can be combined with other
optimizations and probably in a less drastic fashion
(e.g. 2x rather than 16x).
Stefano Garzarella July 30, 2019, 9:35 a.m. UTC | #10
On Mon, Jul 29, 2019 at 03:10:15PM -0400, Michael S. Tsirkin wrote:
> On Mon, Jul 29, 2019 at 06:50:56PM +0200, Stefano Garzarella wrote:
> > On Mon, Jul 29, 2019 at 06:19:03PM +0200, Stefano Garzarella wrote:
> > > On Mon, Jul 29, 2019 at 11:49:02AM -0400, Michael S. Tsirkin wrote:
> > > > On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> > > > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > > > and pushed to the guest using the vring, are directly queued in
> > > > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > > > with a fixed size (4 KB).
> > > > > > > 
> > > > > > > The maximum amount of memory used by each socket should be
> > > > > > > controlled by the credit mechanism.
> > > > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > > > to avoid starvation of other sockets.
> > > > > > > 
> > > > > > > This patch mitigates this issue copying the payload of small
> > > > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > > > order to avoid wasting memory.
> > > > > > > 
> > > > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > > > 
> > > > > > This is good enough for net-next, but for net I think we
> > > > > > should figure out how to address the issue completely.
> > > > > > Can we make the accounting precise? What happens to
> > > > > > performance if we do?
> > > > > > 
> > > > > 
> > > > > In order to do more precise accounting maybe we can use the buffer size,
> > > > > instead of payload size when we update the credit available.
> > > > > In this way, the credit available for each socket will reflect the memory
> > > > > actually used.
> > > > > 
> > > > > I should check better, because I'm not sure what happen if the peer sees
> > > > > 1KB of space available, then it sends 1KB of payload (using a 4KB
> > > > > buffer).
> > > > > 
> > > > > The other option is to copy each packet in a new buffer like I did in
> > > > > the v2 [2], but this forces us to make a copy for each packet that does
> > > > > not fill the entire buffer, perhaps too expensive.
> > > > > 
> > > > > [2] https://patchwork.kernel.org/patch/10938741/
> > > > > 
> > > > > 
> > > > > Thanks,
> > > > > Stefano
> > > > 
> > > > Interesting. You are right, and at some level the protocol forces copies.
> > > > 
> > > > We could try to detect that the actual memory is getting close to
> > > > admin limits and force copies on queued packets after the fact.
> > > > Is that practical?
> > > 
> > > Yes, I think it is doable!
> > > We can decrease the credit available with the buffer size queued, and
> > > when the buffer size of packet to queue is bigger than the credit
> > > available, we can copy it.
> > > 
> > > > 
> > > > And yes we can extend the credit accounting to include buffer size.
> > > > That's a protocol change but maybe it makes sense.
> > > 
> > > Since we send to the other peer the credit available, maybe this
> > > change can be backwards compatible (I'll check better this).
> > 
> > What I said was wrong.
> > 
> > We send a counter (increased when the user consumes the packets) and the
> > "buf_alloc" (the max memory allowed) to the other peer.
> > It makes a difference between a local counter (increased when the
> > packets are sent) and the remote counter to calculate the credit available:
> > 
> >     u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
> >     {
> >     	u32 ret;
> > 
> >     	spin_lock_bh(&vvs->tx_lock);
> >     	ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
> >     	if (ret > credit)
> >     		ret = credit;
> >     	vvs->tx_cnt += ret;
> >     	spin_unlock_bh(&vvs->tx_lock);
> > 
> >     	return ret;
> >     }
> > 
> > Maybe I can play with "buf_alloc" to take care of bytes queued but not
> > used.
> > 
> > Thanks,
> > Stefano
> 
> Right. And the idea behind it all was that if we send a credit
> to remote then we have space for it.

Yes.

> I think the basic idea was that if we have actual allocated
> memory and can copy data there, then we send the credit to
> remote.
> 
> Of course that means an extra copy every packet.
> So as an optimization, it seems that we just assume
> that we will be able to allocate a new buffer.

Yes, we refill the virtqueue when half of the buffers were used.

> 
> First this is not the best we can do. We can actually do
> allocate memory in the socket before sending credit.

In this case, IIUC we should allocate an entire buffer (4KB),
so we can reuse it if the packet is big.

> If packet is small then we copy it there.
> If packet is big then we queue the packet,
> take the buffer out of socket and add it to the virtqueue.
> 
> Second question is what to do about medium sized packets.
> Packet is 1K but buffer is 4K, what do we do?
> And here I wonder - why don't we add the 3K buffer
> to the vq?

This would allow us to have an accurate credit account.

The problem here is the compatibility. Before this series virtio-vsock
and vhost-vsock modules had the RX buffer size hard-coded
(VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
of 4K, there might be issues.

Maybe it is the time to add add 'features' to virtio-vsock device.

Thanks,
Stefano
Michael S. Tsirkin July 30, 2019, 8:42 p.m. UTC | #11
On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> On Mon, Jul 29, 2019 at 03:10:15PM -0400, Michael S. Tsirkin wrote:
> > On Mon, Jul 29, 2019 at 06:50:56PM +0200, Stefano Garzarella wrote:
> > > On Mon, Jul 29, 2019 at 06:19:03PM +0200, Stefano Garzarella wrote:
> > > > On Mon, Jul 29, 2019 at 11:49:02AM -0400, Michael S. Tsirkin wrote:
> > > > > On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> > > > > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > > > > and pushed to the guest using the vring, are directly queued in
> > > > > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > > > > with a fixed size (4 KB).
> > > > > > > > 
> > > > > > > > The maximum amount of memory used by each socket should be
> > > > > > > > controlled by the credit mechanism.
> > > > > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > > > > to avoid starvation of other sockets.
> > > > > > > > 
> > > > > > > > This patch mitigates this issue copying the payload of small
> > > > > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > > > > order to avoid wasting memory.
> > > > > > > > 
> > > > > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > > > > 
> > > > > > > This is good enough for net-next, but for net I think we
> > > > > > > should figure out how to address the issue completely.
> > > > > > > Can we make the accounting precise? What happens to
> > > > > > > performance if we do?
> > > > > > > 
> > > > > > 
> > > > > > In order to do more precise accounting maybe we can use the buffer size,
> > > > > > instead of payload size when we update the credit available.
> > > > > > In this way, the credit available for each socket will reflect the memory
> > > > > > actually used.
> > > > > > 
> > > > > > I should check better, because I'm not sure what happen if the peer sees
> > > > > > 1KB of space available, then it sends 1KB of payload (using a 4KB
> > > > > > buffer).
> > > > > > 
> > > > > > The other option is to copy each packet in a new buffer like I did in
> > > > > > the v2 [2], but this forces us to make a copy for each packet that does
> > > > > > not fill the entire buffer, perhaps too expensive.
> > > > > > 
> > > > > > [2] https://patchwork.kernel.org/patch/10938741/
> > > > > > 
> > > > > > 
> > > > > > Thanks,
> > > > > > Stefano
> > > > > 
> > > > > Interesting. You are right, and at some level the protocol forces copies.
> > > > > 
> > > > > We could try to detect that the actual memory is getting close to
> > > > > admin limits and force copies on queued packets after the fact.
> > > > > Is that practical?
> > > > 
> > > > Yes, I think it is doable!
> > > > We can decrease the credit available with the buffer size queued, and
> > > > when the buffer size of packet to queue is bigger than the credit
> > > > available, we can copy it.
> > > > 
> > > > > 
> > > > > And yes we can extend the credit accounting to include buffer size.
> > > > > That's a protocol change but maybe it makes sense.
> > > > 
> > > > Since we send to the other peer the credit available, maybe this
> > > > change can be backwards compatible (I'll check better this).
> > > 
> > > What I said was wrong.
> > > 
> > > We send a counter (increased when the user consumes the packets) and the
> > > "buf_alloc" (the max memory allowed) to the other peer.
> > > It makes a difference between a local counter (increased when the
> > > packets are sent) and the remote counter to calculate the credit available:
> > > 
> > >     u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
> > >     {
> > >     	u32 ret;
> > > 
> > >     	spin_lock_bh(&vvs->tx_lock);
> > >     	ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
> > >     	if (ret > credit)
> > >     		ret = credit;
> > >     	vvs->tx_cnt += ret;
> > >     	spin_unlock_bh(&vvs->tx_lock);
> > > 
> > >     	return ret;
> > >     }
> > > 
> > > Maybe I can play with "buf_alloc" to take care of bytes queued but not
> > > used.
> > > 
> > > Thanks,
> > > Stefano
> > 
> > Right. And the idea behind it all was that if we send a credit
> > to remote then we have space for it.
> 
> Yes.
> 
> > I think the basic idea was that if we have actual allocated
> > memory and can copy data there, then we send the credit to
> > remote.
> > 
> > Of course that means an extra copy every packet.
> > So as an optimization, it seems that we just assume
> > that we will be able to allocate a new buffer.
> 
> Yes, we refill the virtqueue when half of the buffers were used.
> 
> > 
> > First this is not the best we can do. We can actually do
> > allocate memory in the socket before sending credit.
> 
> In this case, IIUC we should allocate an entire buffer (4KB),
> so we can reuse it if the packet is big.
> 
> > If packet is small then we copy it there.
> > If packet is big then we queue the packet,
> > take the buffer out of socket and add it to the virtqueue.
> > 
> > Second question is what to do about medium sized packets.
> > Packet is 1K but buffer is 4K, what do we do?
> > And here I wonder - why don't we add the 3K buffer
> > to the vq?
> 
> This would allow us to have an accurate credit account.
> 
> The problem here is the compatibility. Before this series virtio-vsock
> and vhost-vsock modules had the RX buffer size hard-coded
> (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> of 4K, there might be issues.

Shouldn't be if they are following the spec. If not let's fix
the broken parts.

> 
> Maybe it is the time to add add 'features' to virtio-vsock device.
> 
> Thanks,
> Stefano

Why would a remote care about buffer sizes?

Let's first see what the issues are. If they exist
we can either fix the bugs, or code the bug as a feature in spec.
Stefano Garzarella Aug. 1, 2019, 10:47 a.m. UTC | #12
On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:

(...)

> > 
> > The problem here is the compatibility. Before this series virtio-vsock
> > and vhost-vsock modules had the RX buffer size hard-coded
> > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > of 4K, there might be issues.
> 
> Shouldn't be if they are following the spec. If not let's fix
> the broken parts.
> 
> > 
> > Maybe it is the time to add add 'features' to virtio-vsock device.
> > 
> > Thanks,
> > Stefano
> 
> Why would a remote care about buffer sizes?
> 
> Let's first see what the issues are. If they exist
> we can either fix the bugs, or code the bug as a feature in spec.
> 

The vhost_transport '.stream_enqueue' callback
[virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
passing the user message. This function allocates a new packet, copying
the user message, but (before this series) it limits the packet size to
the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):

static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
					  struct virtio_vsock_pkt_info *info)
{
 ...
	/* we can send less than pkt_len bytes */
	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;

	/* virtio_transport_get_credit might return less than pkt_len credit */
	pkt_len = virtio_transport_get_credit(vvs, pkt_len);

	/* Do not send zero length OP_RW pkt */
	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
		return pkt_len;
 ...
}

then it queues the packet for the TX worker calling .send_pkt()
[vhost_transport_send_pkt() in the vhost_transport case]

The main function executed by the TX worker is
vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
and it tries to copy the packet (up to 4K) on it.  If the buffer
allocated from the guest will be smaller then 4K, I think here it will
be discarded with an error:

static void
vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
				struct vhost_virtqueue *vq)
{
 ...
		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
		if (nbytes != pkt->len) {
			virtio_transport_free_pkt(pkt);
			vq_err(vq, "Faulted on copying pkt buf\n");
			break;
		}
 ...
}


This series changes this behavior since now we will split the packet in
vhost_transport_do_send_pkt() depending on the buffer found in the
virtqueue.

We didn't change the buffer size in this series, so we still backward
compatible, but if we will use buffers smaller than 4K, we should
encounter the error described above.

How do you suggest we proceed if we want to change the buffer size?
Maybe adding a feature to "support any buffer size"?

Thanks,
Stefano
Michael S. Tsirkin Aug. 1, 2019, 1:21 p.m. UTC | #13
On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> 
> (...)
> 
> > > 
> > > The problem here is the compatibility. Before this series virtio-vsock
> > > and vhost-vsock modules had the RX buffer size hard-coded
> > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > of 4K, there might be issues.
> > 
> > Shouldn't be if they are following the spec. If not let's fix
> > the broken parts.
> > 
> > > 
> > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > > 
> > > Thanks,
> > > Stefano
> > 
> > Why would a remote care about buffer sizes?
> > 
> > Let's first see what the issues are. If they exist
> > we can either fix the bugs, or code the bug as a feature in spec.
> > 
> 
> The vhost_transport '.stream_enqueue' callback
> [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> passing the user message. This function allocates a new packet, copying
> the user message, but (before this series) it limits the packet size to
> the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
> 
> static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> 					  struct virtio_vsock_pkt_info *info)
> {
>  ...
> 	/* we can send less than pkt_len bytes */
> 	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> 		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> 
> 	/* virtio_transport_get_credit might return less than pkt_len credit */
> 	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> 
> 	/* Do not send zero length OP_RW pkt */
> 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> 		return pkt_len;
>  ...
> }
> 
> then it queues the packet for the TX worker calling .send_pkt()
> [vhost_transport_send_pkt() in the vhost_transport case]
> 
> The main function executed by the TX worker is
> vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> and it tries to copy the packet (up to 4K) on it.  If the buffer
> allocated from the guest will be smaller then 4K, I think here it will
> be discarded with an error:
> 
> static void
> vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> 				struct vhost_virtqueue *vq)
> {
>  ...
> 		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);

isn't pck len the actual length though?

> 		if (nbytes != pkt->len) {
> 			virtio_transport_free_pkt(pkt);
> 			vq_err(vq, "Faulted on copying pkt buf\n");
> 			break;
> 		}
>  ...
> }
> 
> 
> This series changes this behavior since now we will split the packet in
> vhost_transport_do_send_pkt() depending on the buffer found in the
> virtqueue.
> 
> We didn't change the buffer size in this series, so we still backward
> compatible, but if we will use buffers smaller than 4K, we should
> encounter the error described above.
> 
> How do you suggest we proceed if we want to change the buffer size?
> Maybe adding a feature to "support any buffer size"?
> 
> Thanks,
> Stefano
Stefano Garzarella Aug. 1, 2019, 1:36 p.m. UTC | #14
On Thu, Aug 01, 2019 at 09:21:15AM -0400, Michael S. Tsirkin wrote:
> On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> > On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> > 
> > (...)
> > 
> > > > 
> > > > The problem here is the compatibility. Before this series virtio-vsock
> > > > and vhost-vsock modules had the RX buffer size hard-coded
> > > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > > of 4K, there might be issues.
> > > 
> > > Shouldn't be if they are following the spec. If not let's fix
> > > the broken parts.
> > > 
> > > > 
> > > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > > > 
> > > > Thanks,
> > > > Stefano
> > > 
> > > Why would a remote care about buffer sizes?
> > > 
> > > Let's first see what the issues are. If they exist
> > > we can either fix the bugs, or code the bug as a feature in spec.
> > > 
> > 
> > The vhost_transport '.stream_enqueue' callback
> > [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> > passing the user message. This function allocates a new packet, copying
> > the user message, but (before this series) it limits the packet size to
> > the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
> > 
> > static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> > 					  struct virtio_vsock_pkt_info *info)
> > {
> >  ...
> > 	/* we can send less than pkt_len bytes */
> > 	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> > 		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> > 
> > 	/* virtio_transport_get_credit might return less than pkt_len credit */
> > 	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> > 
> > 	/* Do not send zero length OP_RW pkt */
> > 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> > 		return pkt_len;
> >  ...
> > }
> > 
> > then it queues the packet for the TX worker calling .send_pkt()
> > [vhost_transport_send_pkt() in the vhost_transport case]
> > 
> > The main function executed by the TX worker is
> > vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> > and it tries to copy the packet (up to 4K) on it.  If the buffer
> > allocated from the guest will be smaller then 4K, I think here it will
> > be discarded with an error:
> > 

I'm adding more lines to explain better.

> > static void
> > vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> > 				struct vhost_virtqueue *vq)
> > {
		...

		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);

		...

		len = iov_length(&vq->iov[out], in);
		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);

		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
		if (nbytes != sizeof(pkt->hdr)) {
			virtio_transport_free_pkt(pkt);
			vq_err(vq, "Faulted on copying pkt hdr\n");
			break;
		}

> >  ...
> > 		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> 
> isn't pck len the actual length though?
> 

It is the length of the packet that we are copying in the guest RX
buffers pointed by the iov_iter. The guest allocates an iovec with 2
buffers, one for the header and one for the payload (4KB).

> > 		if (nbytes != pkt->len) {
> > 			virtio_transport_free_pkt(pkt);
> > 			vq_err(vq, "Faulted on copying pkt buf\n");
> > 			break;
> > 		}
> >  ...
> > }
> > 
> > 
> > This series changes this behavior since now we will split the packet in
> > vhost_transport_do_send_pkt() depending on the buffer found in the
> > virtqueue.
> > 
> > We didn't change the buffer size in this series, so we still backward
> > compatible, but if we will use buffers smaller than 4K, we should
> > encounter the error described above.
> > 
> > How do you suggest we proceed if we want to change the buffer size?
> > Maybe adding a feature to "support any buffer size"?
> > 
> > Thanks,
> > Stefano
> 
> 

--
Stefano Garzarella Aug. 30, 2019, 9:40 a.m. UTC | #15
On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > Since virtio-vsock was introduced, the buffers filled by the host
> > and pushed to the guest using the vring, are directly queued in
> > a per-socket list. These buffers are preallocated by the guest
> > with a fixed size (4 KB).
> > 
> > The maximum amount of memory used by each socket should be
> > controlled by the credit mechanism.
> > The default credit available per-socket is 256 KB, but if we use
> > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > buffers, using up to 1 GB of memory per-socket. In addition, the
> > guest will continue to fill the vring with new 4 KB free buffers
> > to avoid starvation of other sockets.
> > 
> > This patch mitigates this issue copying the payload of small
> > packets (< 128 bytes) into the buffer of last packet queued, in
> > order to avoid wasting memory.
> > 
> > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> 
> This is good enough for net-next, but for net I think we
> should figure out how to address the issue completely.
> Can we make the accounting precise? What happens to
> performance if we do?
> 

Since I'm back from holidays, I'm restarting this thread to figure out
how to address the issue completely.

I did a better analysis of the credit mechanism that we implemented in
virtio-vsock to get a clearer view and I'd share it with you:

    This issue affect only the "host->guest" path. In this case, when the
    host wants to send a packet to the guest, it uses a "free" buffer
    allocated by the guest (4KB).
    The "free" buffers available for the host are shared between all
    sockets, instead, the credit mechanism is per-socket, I think to
    avoid the starvation of others sockets.
    The guests re-fill the "free" queue when the available buffers are
    less than half.

    Each peer have these variables in the per-socket state:
       /* local vars */
       buf_alloc        /* max bytes usable by this socket
                           [exposed to the other peer] */
       fwd_cnt          /* increased when RX packet is consumed by the
                           user space [exposed to the other peer] */
       tx_cnt 	        /* increased when TX packet is sent to the other peer */

       /* remote vars  */
       peer_buf_alloc   /* peer's buf_alloc */
       peer_fwd_cnt     /* peer's fwd_cnt */

    When a peer sends a packet, it increases the 'tx_cnt'; when the
    receiver consumes the packet (copy it to the user-space buffer), it
    increases the 'fwd_cnt'.
    Note: increments are made considering the payload length and not the
    buffer length.

    The value of 'buf_alloc' and 'fwd_cnt' are sent to the other peer in
    all packet headers or with an explicit CREDIT_UPDATE packet.

    The local 'buf_alloc' value can be modified by the user space using
    setsockopt() with optname=SO_VM_SOCKETS_BUFFER_SIZE.

    Before to send a packet, the peer checks the space available:
    	credit_available = peer_buf_alloc - (tx_cnt - peer_fwd_cnt)
    and it will send up to credit_available bytes to the other peer.

Possible solutions considering Michael's advice:
1. Use the buffer length instead of the payload length when we increment
   the counters:
  - This approach will account precisely the memory used per socket.
  - This requires changes in both guest and host.
  - It is not compatible with old drivers, so a feature should be negotiated.

2. Decrease the advertised 'buf_alloc' taking count of bytes queued in
   the socket queue but not used. (e.g. 256 byte used on 4K available in
   the buffer)
  - pkt->hdr.buf_alloc = buf_alloc - bytes_not_used.
  - This should be compatible also with old drivers.

Maybe the second is less invasive, but will it be too tricky?
Any other advice or suggestions?

Thanks in advance,
Stefano
Michael S. Tsirkin Sept. 1, 2019, 6:56 a.m. UTC | #16
On Fri, Aug 30, 2019 at 11:40:59AM +0200, Stefano Garzarella wrote:
> On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > Since virtio-vsock was introduced, the buffers filled by the host
> > > and pushed to the guest using the vring, are directly queued in
> > > a per-socket list. These buffers are preallocated by the guest
> > > with a fixed size (4 KB).
> > > 
> > > The maximum amount of memory used by each socket should be
> > > controlled by the credit mechanism.
> > > The default credit available per-socket is 256 KB, but if we use
> > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > guest will continue to fill the vring with new 4 KB free buffers
> > > to avoid starvation of other sockets.
> > > 
> > > This patch mitigates this issue copying the payload of small
> > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > order to avoid wasting memory.
> > > 
> > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > 
> > This is good enough for net-next, but for net I think we
> > should figure out how to address the issue completely.
> > Can we make the accounting precise? What happens to
> > performance if we do?
> > 
> 
> Since I'm back from holidays, I'm restarting this thread to figure out
> how to address the issue completely.
> 
> I did a better analysis of the credit mechanism that we implemented in
> virtio-vsock to get a clearer view and I'd share it with you:
> 
>     This issue affect only the "host->guest" path. In this case, when the
>     host wants to send a packet to the guest, it uses a "free" buffer
>     allocated by the guest (4KB).
>     The "free" buffers available for the host are shared between all
>     sockets, instead, the credit mechanism is per-socket, I think to
>     avoid the starvation of others sockets.
>     The guests re-fill the "free" queue when the available buffers are
>     less than half.
> 
>     Each peer have these variables in the per-socket state:
>        /* local vars */
>        buf_alloc        /* max bytes usable by this socket
>                            [exposed to the other peer] */
>        fwd_cnt          /* increased when RX packet is consumed by the
>                            user space [exposed to the other peer] */
>        tx_cnt 	        /* increased when TX packet is sent to the other peer */
> 
>        /* remote vars  */
>        peer_buf_alloc   /* peer's buf_alloc */
>        peer_fwd_cnt     /* peer's fwd_cnt */
> 
>     When a peer sends a packet, it increases the 'tx_cnt'; when the
>     receiver consumes the packet (copy it to the user-space buffer), it
>     increases the 'fwd_cnt'.
>     Note: increments are made considering the payload length and not the
>     buffer length.
> 
>     The value of 'buf_alloc' and 'fwd_cnt' are sent to the other peer in
>     all packet headers or with an explicit CREDIT_UPDATE packet.
> 
>     The local 'buf_alloc' value can be modified by the user space using
>     setsockopt() with optname=SO_VM_SOCKETS_BUFFER_SIZE.
> 
>     Before to send a packet, the peer checks the space available:
>     	credit_available = peer_buf_alloc - (tx_cnt - peer_fwd_cnt)
>     and it will send up to credit_available bytes to the other peer.
> 
> Possible solutions considering Michael's advice:
> 1. Use the buffer length instead of the payload length when we increment
>    the counters:
>   - This approach will account precisely the memory used per socket.
>   - This requires changes in both guest and host.
>   - It is not compatible with old drivers, so a feature should be negotiated.
> 2. Decrease the advertised 'buf_alloc' taking count of bytes queued in
>    the socket queue but not used. (e.g. 256 byte used on 4K available in
>    the buffer)
>   - pkt->hdr.buf_alloc = buf_alloc - bytes_not_used.
>   - This should be compatible also with old drivers.
> 
> Maybe the second is less invasive, but will it be too tricky?
> Any other advice or suggestions?
> 
> Thanks in advance,
> Stefano

OK let me try to clarify.  The idea is this:

Let's say we queue a buffer of 4K, and we copy if len < 128 bytes.  This
means that in the worst case (128 byte packets), each byte of credit in
the socket uses up 4K/128 = 16 bytes of kernel memory. In fact we need
to also account for the virtio_vsock_pkt since I think it's kept around
until userspace consumes it.

Thus given X buf alloc allowed in the socket, we should publish X/16
credits to the other side. This will ensure the other side does not send
more than X/16 bytes for a given socket and thus we won't need to
allocate more than X bytes to hold the data.

We can play with the copy break value to tweak this.
Michael S. Tsirkin Sept. 1, 2019, 8:26 a.m. UTC | #17
On Thu, Aug 01, 2019 at 03:36:16PM +0200, Stefano Garzarella wrote:
> On Thu, Aug 01, 2019 at 09:21:15AM -0400, Michael S. Tsirkin wrote:
> > On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> > > On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> > > 
> > > (...)
> > > 
> > > > > 
> > > > > The problem here is the compatibility. Before this series virtio-vsock
> > > > > and vhost-vsock modules had the RX buffer size hard-coded
> > > > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > > > of 4K, there might be issues.
> > > > 
> > > > Shouldn't be if they are following the spec. If not let's fix
> > > > the broken parts.
> > > > 
> > > > > 
> > > > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > > > > 
> > > > > Thanks,
> > > > > Stefano
> > > > 
> > > > Why would a remote care about buffer sizes?
> > > > 
> > > > Let's first see what the issues are. If they exist
> > > > we can either fix the bugs, or code the bug as a feature in spec.
> > > > 
> > > 
> > > The vhost_transport '.stream_enqueue' callback
> > > [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> > > passing the user message. This function allocates a new packet, copying
> > > the user message, but (before this series) it limits the packet size to
> > > the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
> > > 
> > > static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> > > 					  struct virtio_vsock_pkt_info *info)
> > > {
> > >  ...
> > > 	/* we can send less than pkt_len bytes */
> > > 	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> > > 		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> > > 
> > > 	/* virtio_transport_get_credit might return less than pkt_len credit */
> > > 	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> > > 
> > > 	/* Do not send zero length OP_RW pkt */
> > > 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> > > 		return pkt_len;
> > >  ...
> > > }
> > > 
> > > then it queues the packet for the TX worker calling .send_pkt()
> > > [vhost_transport_send_pkt() in the vhost_transport case]
> > > 
> > > The main function executed by the TX worker is
> > > vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> > > and it tries to copy the packet (up to 4K) on it.  If the buffer
> > > allocated from the guest will be smaller then 4K, I think here it will
> > > be discarded with an error:
> > > 
> 
> I'm adding more lines to explain better.
> 
> > > static void
> > > vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> > > 				struct vhost_virtqueue *vq)
> > > {
> 		...
> 
> 		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> 					 &out, &in, NULL, NULL);
> 
> 		...
> 
> 		len = iov_length(&vq->iov[out], in);
> 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
> 
> 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
> 		if (nbytes != sizeof(pkt->hdr)) {
> 			virtio_transport_free_pkt(pkt);
> 			vq_err(vq, "Faulted on copying pkt hdr\n");
> 			break;
> 		}
> 
> > >  ...
> > > 		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> > 
> > isn't pck len the actual length though?
> > 
> 
> It is the length of the packet that we are copying in the guest RX
> buffers pointed by the iov_iter. The guest allocates an iovec with 2
> buffers, one for the header and one for the payload (4KB).

BTW at the moment that forces another kmalloc within virtio core. Maybe
vsock needs a flag to skip allocation in this case.  Worth benchmarking.
See virtqueue_use_indirect which just does total_sg > 1.

> 
> > > 		if (nbytes != pkt->len) {
> > > 			virtio_transport_free_pkt(pkt);
> > > 			vq_err(vq, "Faulted on copying pkt buf\n");
> > > 			break;
> > > 		}
> > >  ...
> > > }
> > > 
> > > 
> > > This series changes this behavior since now we will split the packet in
> > > vhost_transport_do_send_pkt() depending on the buffer found in the
> > > virtqueue.
> > > 
> > > We didn't change the buffer size in this series, so we still backward
> > > compatible, but if we will use buffers smaller than 4K, we should
> > > encounter the error described above.

So that's an implementation bug then? It made an assumption
of a 4K sized buffer? Or even PAGE_SIZE sized buffer?


> > > 
> > > How do you suggest we proceed if we want to change the buffer size?
> > > Maybe adding a feature to "support any buffer size"?
> > > 
> > > Thanks,
> > > Stefano
> > 
> > 
> 
> --
Michael S. Tsirkin Sept. 1, 2019, 10:17 a.m. UTC | #18
On Sun, Sep 01, 2019 at 04:26:19AM -0400, Michael S. Tsirkin wrote:
> On Thu, Aug 01, 2019 at 03:36:16PM +0200, Stefano Garzarella wrote:
> > On Thu, Aug 01, 2019 at 09:21:15AM -0400, Michael S. Tsirkin wrote:
> > > On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> > > > On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > > > > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> > > > 
> > > > (...)
> > > > 
> > > > > > 
> > > > > > The problem here is the compatibility. Before this series virtio-vsock
> > > > > > and vhost-vsock modules had the RX buffer size hard-coded
> > > > > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > > > > of 4K, there might be issues.
> > > > > 
> > > > > Shouldn't be if they are following the spec. If not let's fix
> > > > > the broken parts.
> > > > > 
> > > > > > 
> > > > > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > > > > > 
> > > > > > Thanks,
> > > > > > Stefano
> > > > > 
> > > > > Why would a remote care about buffer sizes?
> > > > > 
> > > > > Let's first see what the issues are. If they exist
> > > > > we can either fix the bugs, or code the bug as a feature in spec.
> > > > > 
> > > > 
> > > > The vhost_transport '.stream_enqueue' callback
> > > > [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> > > > passing the user message. This function allocates a new packet, copying
> > > > the user message, but (before this series) it limits the packet size to
> > > > the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
> > > > 
> > > > static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> > > > 					  struct virtio_vsock_pkt_info *info)
> > > > {
> > > >  ...
> > > > 	/* we can send less than pkt_len bytes */
> > > > 	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> > > > 		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> > > > 
> > > > 	/* virtio_transport_get_credit might return less than pkt_len credit */
> > > > 	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> > > > 
> > > > 	/* Do not send zero length OP_RW pkt */
> > > > 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> > > > 		return pkt_len;
> > > >  ...
> > > > }
> > > > 
> > > > then it queues the packet for the TX worker calling .send_pkt()
> > > > [vhost_transport_send_pkt() in the vhost_transport case]
> > > > 
> > > > The main function executed by the TX worker is
> > > > vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> > > > and it tries to copy the packet (up to 4K) on it.  If the buffer
> > > > allocated from the guest will be smaller then 4K, I think here it will
> > > > be discarded with an error:
> > > > 
> > 
> > I'm adding more lines to explain better.
> > 
> > > > static void
> > > > vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> > > > 				struct vhost_virtqueue *vq)
> > > > {
> > 		...
> > 
> > 		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> > 					 &out, &in, NULL, NULL);
> > 
> > 		...
> > 
> > 		len = iov_length(&vq->iov[out], in);
> > 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
> > 
> > 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
> > 		if (nbytes != sizeof(pkt->hdr)) {
> > 			virtio_transport_free_pkt(pkt);
> > 			vq_err(vq, "Faulted on copying pkt hdr\n");
> > 			break;
> > 		}
> > 
> > > >  ...
> > > > 		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> > > 
> > > isn't pck len the actual length though?
> > > 
> > 
> > It is the length of the packet that we are copying in the guest RX
> > buffers pointed by the iov_iter. The guest allocates an iovec with 2
> > buffers, one for the header and one for the payload (4KB).
> 
> BTW at the moment that forces another kmalloc within virtio core. Maybe
> vsock needs a flag to skip allocation in this case.  Worth benchmarking.
> See virtqueue_use_indirect which just does total_sg > 1.
> 
> > 
> > > > 		if (nbytes != pkt->len) {
> > > > 			virtio_transport_free_pkt(pkt);
> > > > 			vq_err(vq, "Faulted on copying pkt buf\n");
> > > > 			break;
> > > > 		}
> > > >  ...
> > > > }
> > > > 
> > > > 
> > > > This series changes this behavior since now we will split the packet in
> > > > vhost_transport_do_send_pkt() depending on the buffer found in the
> > > > virtqueue.
> > > > 
> > > > We didn't change the buffer size in this series, so we still backward
> > > > compatible, but if we will use buffers smaller than 4K, we should
> > > > encounter the error described above.
> 
> So that's an implementation bug then? It made an assumption
> of a 4K sized buffer? Or even PAGE_SIZE sized buffer?

Assuming we miss nothing and buffers < 4K are broken,
I think we need to add this to the spec, possibly with
a feature bit to relax the requirement that all buffers
are at least 4k in size.

> 
> > > > 
> > > > How do you suggest we proceed if we want to change the buffer size?
> > > > Maybe adding a feature to "support any buffer size"?
> > > > 
> > > > Thanks,
> > > > Stefano
> > > 
> > > 
> > 
> > --
Stefan Hajnoczi Sept. 2, 2019, 8:39 a.m. UTC | #19
On Sun, Sep 01, 2019 at 02:56:44AM -0400, Michael S. Tsirkin wrote:
> On Fri, Aug 30, 2019 at 11:40:59AM +0200, Stefano Garzarella wrote:
> > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > and pushed to the guest using the vring, are directly queued in
> > > > a per-socket list. These buffers are preallocated by the guest
> > > > with a fixed size (4 KB).
> > > > 
> > > > The maximum amount of memory used by each socket should be
> > > > controlled by the credit mechanism.
> > > > The default credit available per-socket is 256 KB, but if we use
> > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > to avoid starvation of other sockets.
> > > > 
> > > > This patch mitigates this issue copying the payload of small
> > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > order to avoid wasting memory.
> > > > 
> > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > 
> > > This is good enough for net-next, but for net I think we
> > > should figure out how to address the issue completely.
> > > Can we make the accounting precise? What happens to
> > > performance if we do?
> > > 
> > 
> > Since I'm back from holidays, I'm restarting this thread to figure out
> > how to address the issue completely.
> > 
> > I did a better analysis of the credit mechanism that we implemented in
> > virtio-vsock to get a clearer view and I'd share it with you:
> > 
> >     This issue affect only the "host->guest" path. In this case, when the
> >     host wants to send a packet to the guest, it uses a "free" buffer
> >     allocated by the guest (4KB).
> >     The "free" buffers available for the host are shared between all
> >     sockets, instead, the credit mechanism is per-socket, I think to
> >     avoid the starvation of others sockets.
> >     The guests re-fill the "free" queue when the available buffers are
> >     less than half.
> > 
> >     Each peer have these variables in the per-socket state:
> >        /* local vars */
> >        buf_alloc        /* max bytes usable by this socket
> >                            [exposed to the other peer] */
> >        fwd_cnt          /* increased when RX packet is consumed by the
> >                            user space [exposed to the other peer] */
> >        tx_cnt 	        /* increased when TX packet is sent to the other peer */
> > 
> >        /* remote vars  */
> >        peer_buf_alloc   /* peer's buf_alloc */
> >        peer_fwd_cnt     /* peer's fwd_cnt */
> > 
> >     When a peer sends a packet, it increases the 'tx_cnt'; when the
> >     receiver consumes the packet (copy it to the user-space buffer), it
> >     increases the 'fwd_cnt'.
> >     Note: increments are made considering the payload length and not the
> >     buffer length.
> > 
> >     The value of 'buf_alloc' and 'fwd_cnt' are sent to the other peer in
> >     all packet headers or with an explicit CREDIT_UPDATE packet.
> > 
> >     The local 'buf_alloc' value can be modified by the user space using
> >     setsockopt() with optname=SO_VM_SOCKETS_BUFFER_SIZE.
> > 
> >     Before to send a packet, the peer checks the space available:
> >     	credit_available = peer_buf_alloc - (tx_cnt - peer_fwd_cnt)
> >     and it will send up to credit_available bytes to the other peer.
> > 
> > Possible solutions considering Michael's advice:
> > 1. Use the buffer length instead of the payload length when we increment
> >    the counters:
> >   - This approach will account precisely the memory used per socket.
> >   - This requires changes in both guest and host.
> >   - It is not compatible with old drivers, so a feature should be negotiated.
> > 2. Decrease the advertised 'buf_alloc' taking count of bytes queued in
> >    the socket queue but not used. (e.g. 256 byte used on 4K available in
> >    the buffer)
> >   - pkt->hdr.buf_alloc = buf_alloc - bytes_not_used.
> >   - This should be compatible also with old drivers.
> > 
> > Maybe the second is less invasive, but will it be too tricky?
> > Any other advice or suggestions?
> > 
> > Thanks in advance,
> > Stefano
> 
> OK let me try to clarify.  The idea is this:
> 
> Let's say we queue a buffer of 4K, and we copy if len < 128 bytes.  This
> means that in the worst case (128 byte packets), each byte of credit in
> the socket uses up 4K/128 = 16 bytes of kernel memory. In fact we need
> to also account for the virtio_vsock_pkt since I think it's kept around
> until userspace consumes it.
> 
> Thus given X buf alloc allowed in the socket, we should publish X/16
> credits to the other side. This will ensure the other side does not send
> more than X/16 bytes for a given socket and thus we won't need to
> allocate more than X bytes to hold the data.
> 
> We can play with the copy break value to tweak this.

This seems like a reasonable solution.  Hopefully the benchmark results
will come out okay too.

Stefan
Stefano Garzarella Sept. 2, 2019, 8:55 a.m. UTC | #20
On Mon, Sep 02, 2019 at 09:39:12AM +0100, Stefan Hajnoczi wrote:
> On Sun, Sep 01, 2019 at 02:56:44AM -0400, Michael S. Tsirkin wrote:
> > 
> > OK let me try to clarify.  The idea is this:
> > 
> > Let's say we queue a buffer of 4K, and we copy if len < 128 bytes.  This
> > means that in the worst case (128 byte packets), each byte of credit in
> > the socket uses up 4K/128 = 16 bytes of kernel memory. In fact we need
> > to also account for the virtio_vsock_pkt since I think it's kept around
> > until userspace consumes it.
> > 
> > Thus given X buf alloc allowed in the socket, we should publish X/16
> > credits to the other side. This will ensure the other side does not send
> > more than X/16 bytes for a given socket and thus we won't need to
> > allocate more than X bytes to hold the data.
> > 
> > We can play with the copy break value to tweak this.

Thanks Michael, now it is perfectly clear. It seems an excellent solution and
easy to implement. I'll work on that.

> 
> This seems like a reasonable solution.  Hopefully the benchmark results
> will come out okay too.

Yes, as Michael suggested I'll play with the copy break value to see as
benchmark has affected.

Thank you very much,
Stefano
Stefano Garzarella Sept. 2, 2019, 9:57 a.m. UTC | #21
On Sun, Sep 01, 2019 at 06:17:58AM -0400, Michael S. Tsirkin wrote:
> On Sun, Sep 01, 2019 at 04:26:19AM -0400, Michael S. Tsirkin wrote:
> > On Thu, Aug 01, 2019 at 03:36:16PM +0200, Stefano Garzarella wrote:
> > > On Thu, Aug 01, 2019 at 09:21:15AM -0400, Michael S. Tsirkin wrote:
> > > > On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> > > > > On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > > > > > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> > > > > 
> > > > > (...)
> > > > > 
> > > > > > > 
> > > > > > > The problem here is the compatibility. Before this series virtio-vsock
> > > > > > > and vhost-vsock modules had the RX buffer size hard-coded
> > > > > > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > > > > > of 4K, there might be issues.
> > > > > > 
> > > > > > Shouldn't be if they are following the spec. If not let's fix
> > > > > > the broken parts.
> > > > > > 
> > > > > > > 
> > > > > > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > > > > > > 
> > > > > > > Thanks,
> > > > > > > Stefano
> > > > > > 
> > > > > > Why would a remote care about buffer sizes?
> > > > > > 
> > > > > > Let's first see what the issues are. If they exist
> > > > > > we can either fix the bugs, or code the bug as a feature in spec.
> > > > > > 
> > > > > 
> > > > > The vhost_transport '.stream_enqueue' callback
> > > > > [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> > > > > passing the user message. This function allocates a new packet, copying
> > > > > the user message, but (before this series) it limits the packet size to
> > > > > the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
> > > > > 
> > > > > static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> > > > > 					  struct virtio_vsock_pkt_info *info)
> > > > > {
> > > > >  ...
> > > > > 	/* we can send less than pkt_len bytes */
> > > > > 	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> > > > > 		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> > > > > 
> > > > > 	/* virtio_transport_get_credit might return less than pkt_len credit */
> > > > > 	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> > > > > 
> > > > > 	/* Do not send zero length OP_RW pkt */
> > > > > 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> > > > > 		return pkt_len;
> > > > >  ...
> > > > > }
> > > > > 
> > > > > then it queues the packet for the TX worker calling .send_pkt()
> > > > > [vhost_transport_send_pkt() in the vhost_transport case]
> > > > > 
> > > > > The main function executed by the TX worker is
> > > > > vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> > > > > and it tries to copy the packet (up to 4K) on it.  If the buffer
> > > > > allocated from the guest will be smaller then 4K, I think here it will
> > > > > be discarded with an error:
> > > > > 
> > > 
> > > I'm adding more lines to explain better.
> > > 
> > > > > static void
> > > > > vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> > > > > 				struct vhost_virtqueue *vq)
> > > > > {
> > > 		...
> > > 
> > > 		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> > > 					 &out, &in, NULL, NULL);
> > > 
> > > 		...
> > > 
> > > 		len = iov_length(&vq->iov[out], in);
> > > 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
> > > 
> > > 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
> > > 		if (nbytes != sizeof(pkt->hdr)) {
> > > 			virtio_transport_free_pkt(pkt);
> > > 			vq_err(vq, "Faulted on copying pkt hdr\n");
> > > 			break;
> > > 		}
> > > 
> > > > >  ...
> > > > > 		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> > > > 
> > > > isn't pck len the actual length though?
> > > > 
> > > 
> > > It is the length of the packet that we are copying in the guest RX
> > > buffers pointed by the iov_iter. The guest allocates an iovec with 2
> > > buffers, one for the header and one for the payload (4KB).
> > 
> > BTW at the moment that forces another kmalloc within virtio core. Maybe
> > vsock needs a flag to skip allocation in this case.  Worth benchmarking.
> > See virtqueue_use_indirect which just does total_sg > 1.

Okay, I'll take a look at virtqueue_use_indirect and I'll do some
benchmarking.

> > 
> > > 
> > > > > 		if (nbytes != pkt->len) {
> > > > > 			virtio_transport_free_pkt(pkt);
> > > > > 			vq_err(vq, "Faulted on copying pkt buf\n");
> > > > > 			break;
> > > > > 		}
> > > > >  ...
> > > > > }
> > > > > 
> > > > > 
> > > > > This series changes this behavior since now we will split the packet in
> > > > > vhost_transport_do_send_pkt() depending on the buffer found in the
> > > > > virtqueue.
> > > > > 
> > > > > We didn't change the buffer size in this series, so we still backward
> > > > > compatible, but if we will use buffers smaller than 4K, we should
> > > > > encounter the error described above.
> > 
> > So that's an implementation bug then? It made an assumption
> > of a 4K sized buffer? Or even PAGE_SIZE sized buffer?

Yes, I think it made an assumption and it used this macro as a limit:

include/linux/virtio_vsock.h:13:
    #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE        (1024 * 4)

> 
> Assuming we miss nothing and buffers < 4K are broken,
> I think we need to add this to the spec, possibly with
> a feature bit to relax the requirement that all buffers
> are at least 4k in size.
> 

Okay, should I send a proposal to virtio-dev@lists.oasis-open.org?

Thanks,
Stefano
Michael S. Tsirkin Sept. 2, 2019, 3:23 p.m. UTC | #22
On Mon, Sep 02, 2019 at 11:57:23AM +0200, Stefano Garzarella wrote:
> On Sun, Sep 01, 2019 at 06:17:58AM -0400, Michael S. Tsirkin wrote:
> > On Sun, Sep 01, 2019 at 04:26:19AM -0400, Michael S. Tsirkin wrote:
> > > On Thu, Aug 01, 2019 at 03:36:16PM +0200, Stefano Garzarella wrote:
> > > > On Thu, Aug 01, 2019 at 09:21:15AM -0400, Michael S. Tsirkin wrote:
> > > > > On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> > > > > > On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > > > > > > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> > > > > > 
> > > > > > (...)
> > > > > > 
> > > > > > > > 
> > > > > > > > The problem here is the compatibility. Before this series virtio-vsock
> > > > > > > > and vhost-vsock modules had the RX buffer size hard-coded
> > > > > > > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > > > > > > of 4K, there might be issues.
> > > > > > > 
> > > > > > > Shouldn't be if they are following the spec. If not let's fix
> > > > > > > the broken parts.
> > > > > > > 
> > > > > > > > 
> > > > > > > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > > > > > > > 
> > > > > > > > Thanks,
> > > > > > > > Stefano
> > > > > > > 
> > > > > > > Why would a remote care about buffer sizes?
> > > > > > > 
> > > > > > > Let's first see what the issues are. If they exist
> > > > > > > we can either fix the bugs, or code the bug as a feature in spec.
> > > > > > > 
> > > > > > 
> > > > > > The vhost_transport '.stream_enqueue' callback
> > > > > > [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> > > > > > passing the user message. This function allocates a new packet, copying
> > > > > > the user message, but (before this series) it limits the packet size to
> > > > > > the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
> > > > > > 
> > > > > > static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> > > > > > 					  struct virtio_vsock_pkt_info *info)
> > > > > > {
> > > > > >  ...
> > > > > > 	/* we can send less than pkt_len bytes */
> > > > > > 	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> > > > > > 		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> > > > > > 
> > > > > > 	/* virtio_transport_get_credit might return less than pkt_len credit */
> > > > > > 	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> > > > > > 
> > > > > > 	/* Do not send zero length OP_RW pkt */
> > > > > > 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> > > > > > 		return pkt_len;
> > > > > >  ...
> > > > > > }
> > > > > > 
> > > > > > then it queues the packet for the TX worker calling .send_pkt()
> > > > > > [vhost_transport_send_pkt() in the vhost_transport case]
> > > > > > 
> > > > > > The main function executed by the TX worker is
> > > > > > vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> > > > > > and it tries to copy the packet (up to 4K) on it.  If the buffer
> > > > > > allocated from the guest will be smaller then 4K, I think here it will
> > > > > > be discarded with an error:
> > > > > > 
> > > > 
> > > > I'm adding more lines to explain better.
> > > > 
> > > > > > static void
> > > > > > vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> > > > > > 				struct vhost_virtqueue *vq)
> > > > > > {
> > > > 		...
> > > > 
> > > > 		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> > > > 					 &out, &in, NULL, NULL);
> > > > 
> > > > 		...
> > > > 
> > > > 		len = iov_length(&vq->iov[out], in);
> > > > 		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
> > > > 
> > > > 		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
> > > > 		if (nbytes != sizeof(pkt->hdr)) {
> > > > 			virtio_transport_free_pkt(pkt);
> > > > 			vq_err(vq, "Faulted on copying pkt hdr\n");
> > > > 			break;
> > > > 		}
> > > > 
> > > > > >  ...
> > > > > > 		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> > > > > 
> > > > > isn't pck len the actual length though?
> > > > > 
> > > > 
> > > > It is the length of the packet that we are copying in the guest RX
> > > > buffers pointed by the iov_iter. The guest allocates an iovec with 2
> > > > buffers, one for the header and one for the payload (4KB).
> > > 
> > > BTW at the moment that forces another kmalloc within virtio core. Maybe
> > > vsock needs a flag to skip allocation in this case.  Worth benchmarking.
> > > See virtqueue_use_indirect which just does total_sg > 1.
> 
> Okay, I'll take a look at virtqueue_use_indirect and I'll do some
> benchmarking.
> 
> > > 
> > > > 
> > > > > > 		if (nbytes != pkt->len) {
> > > > > > 			virtio_transport_free_pkt(pkt);
> > > > > > 			vq_err(vq, "Faulted on copying pkt buf\n");
> > > > > > 			break;
> > > > > > 		}
> > > > > >  ...
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > This series changes this behavior since now we will split the packet in
> > > > > > vhost_transport_do_send_pkt() depending on the buffer found in the
> > > > > > virtqueue.
> > > > > > 
> > > > > > We didn't change the buffer size in this series, so we still backward
> > > > > > compatible, but if we will use buffers smaller than 4K, we should
> > > > > > encounter the error described above.
> > > 
> > > So that's an implementation bug then? It made an assumption
> > > of a 4K sized buffer? Or even PAGE_SIZE sized buffer?
> 
> Yes, I think it made an assumption and it used this macro as a limit:
> 
> include/linux/virtio_vsock.h:13:
>     #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE        (1024 * 4)
> 
> > 
> > Assuming we miss nothing and buffers < 4K are broken,
> > I think we need to add this to the spec, possibly with
> > a feature bit to relax the requirement that all buffers
> > are at least 4k in size.
> > 
> 
> Okay, should I send a proposal to virtio-dev@lists.oasis-open.org?
> 
> Thanks,
> Stefano

virtio-comment is more appropriate for this.
Michael S. Tsirkin Sept. 3, 2019, 4:39 a.m. UTC | #23
On Mon, Sep 02, 2019 at 11:57:23AM +0200, Stefano Garzarella wrote:
> > 
> > Assuming we miss nothing and buffers < 4K are broken,
> > I think we need to add this to the spec, possibly with
> > a feature bit to relax the requirement that all buffers
> > are at least 4k in size.
> > 
> 
> Okay, should I send a proposal to virtio-dev@lists.oasis-open.org?

How about we also fix the bug for now?
Stefano Garzarella Sept. 3, 2019, 7:45 a.m. UTC | #24
On Tue, Sep 03, 2019 at 12:39:19AM -0400, Michael S. Tsirkin wrote:
> On Mon, Sep 02, 2019 at 11:57:23AM +0200, Stefano Garzarella wrote:
> > > 
> > > Assuming we miss nothing and buffers < 4K are broken,
> > > I think we need to add this to the spec, possibly with
> > > a feature bit to relax the requirement that all buffers
> > > are at least 4k in size.
> > > 
> > 
> > Okay, should I send a proposal to virtio-dev@lists.oasis-open.org?
> 
> How about we also fix the bug for now?

This series unintentionally fix the bug because we are introducing a way
to split the packet depending on the buffer size ([PATCH 4/5] vhost/vsock:
split packets to send using multiple buffers) and we removed the limit
to 4K buffers ([PATCH 5/5] vsock/virtio: change the maximum packet size
allowed).

I discovered that there was a bug while we discussed memory accounting.

Do you think it's enough while we introduce the feature bit in the spec?

Thanks,
Stefano
Michael S. Tsirkin Sept. 3, 2019, 7:52 a.m. UTC | #25
On Tue, Sep 03, 2019 at 09:45:54AM +0200, Stefano Garzarella wrote:
> On Tue, Sep 03, 2019 at 12:39:19AM -0400, Michael S. Tsirkin wrote:
> > On Mon, Sep 02, 2019 at 11:57:23AM +0200, Stefano Garzarella wrote:
> > > > 
> > > > Assuming we miss nothing and buffers < 4K are broken,
> > > > I think we need to add this to the spec, possibly with
> > > > a feature bit to relax the requirement that all buffers
> > > > are at least 4k in size.
> > > > 
> > > 
> > > Okay, should I send a proposal to virtio-dev@lists.oasis-open.org?
> > 
> > How about we also fix the bug for now?
> 
> This series unintentionally fix the bug because we are introducing a way
> to split the packet depending on the buffer size ([PATCH 4/5] vhost/vsock:
> split packets to send using multiple buffers) and we removed the limit
> to 4K buffers ([PATCH 5/5] vsock/virtio: change the maximum packet size
> allowed).
> 
> I discovered that there was a bug while we discussed memory accounting.
> 
> Do you think it's enough while we introduce the feature bit in the spec?
> 
> Thanks,
> Stefano

Well locking is also broken (patch 3/5).  It seems that 3/5 and 4/5 work
by themselves, right?  So how about we ask Dave to send these to stable?
Also, how about 1/5? Also needed for stable?
Stefano Garzarella Sept. 3, 2019, 8 a.m. UTC | #26
On Tue, Sep 03, 2019 at 03:52:24AM -0400, Michael S. Tsirkin wrote:
> On Tue, Sep 03, 2019 at 09:45:54AM +0200, Stefano Garzarella wrote:
> > On Tue, Sep 03, 2019 at 12:39:19AM -0400, Michael S. Tsirkin wrote:
> > > On Mon, Sep 02, 2019 at 11:57:23AM +0200, Stefano Garzarella wrote:
> > > > > 
> > > > > Assuming we miss nothing and buffers < 4K are broken,
> > > > > I think we need to add this to the spec, possibly with
> > > > > a feature bit to relax the requirement that all buffers
> > > > > are at least 4k in size.
> > > > > 
> > > > 
> > > > Okay, should I send a proposal to virtio-dev@lists.oasis-open.org?
> > > 
> > > How about we also fix the bug for now?
> > 
> > This series unintentionally fix the bug because we are introducing a way
> > to split the packet depending on the buffer size ([PATCH 4/5] vhost/vsock:
> > split packets to send using multiple buffers) and we removed the limit
> > to 4K buffers ([PATCH 5/5] vsock/virtio: change the maximum packet size
> > allowed).
> > 
> > I discovered that there was a bug while we discussed memory accounting.
> > 
> > Do you think it's enough while we introduce the feature bit in the spec?
> > 
> > Thanks,
> > Stefano
> 
> Well locking is also broken (patch 3/5).  It seems that 3/5 and 4/5 work
> by themselves, right?  So how about we ask Dave to send these to stable?

Yes, they work by themselves and I agree that should be send to stable.

> Also, how about 1/5? Also needed for stable?

I think so, without this patch if we flood the guest with 1-byte packets,
we can consume ~ 1 GB of guest memory per-socket.

Thanks,
Stefano
Stefano Garzarella Oct. 11, 2019, 1:40 p.m. UTC | #27
On Sun, Sep 1, 2019 at 8:56 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> On Fri, Aug 30, 2019 at 11:40:59AM +0200, Stefano Garzarella wrote:
> > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > and pushed to the guest using the vring, are directly queued in
> > > > a per-socket list. These buffers are preallocated by the guest
> > > > with a fixed size (4 KB).
> > > >
> > > > The maximum amount of memory used by each socket should be
> > > > controlled by the credit mechanism.
> > > > The default credit available per-socket is 256 KB, but if we use
> > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > to avoid starvation of other sockets.
> > > >
> > > > This patch mitigates this issue copying the payload of small
> > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > order to avoid wasting memory.
> > > >
> > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > >
> > > This is good enough for net-next, but for net I think we
> > > should figure out how to address the issue completely.
> > > Can we make the accounting precise? What happens to
> > > performance if we do?
> > >
> >
> > Since I'm back from holidays, I'm restarting this thread to figure out
> > how to address the issue completely.
> >
> > I did a better analysis of the credit mechanism that we implemented in
> > virtio-vsock to get a clearer view and I'd share it with you:
> >
> >     This issue affect only the "host->guest" path. In this case, when the
> >     host wants to send a packet to the guest, it uses a "free" buffer
> >     allocated by the guest (4KB).
> >     The "free" buffers available for the host are shared between all
> >     sockets, instead, the credit mechanism is per-socket, I think to
> >     avoid the starvation of others sockets.
> >     The guests re-fill the "free" queue when the available buffers are
> >     less than half.
> >
> >     Each peer have these variables in the per-socket state:
> >        /* local vars */
> >        buf_alloc        /* max bytes usable by this socket
> >                            [exposed to the other peer] */
> >        fwd_cnt          /* increased when RX packet is consumed by the
> >                            user space [exposed to the other peer] */
> >        tx_cnt                 /* increased when TX packet is sent to the other peer */
> >
> >        /* remote vars  */
> >        peer_buf_alloc   /* peer's buf_alloc */
> >        peer_fwd_cnt     /* peer's fwd_cnt */
> >
> >     When a peer sends a packet, it increases the 'tx_cnt'; when the
> >     receiver consumes the packet (copy it to the user-space buffer), it
> >     increases the 'fwd_cnt'.
> >     Note: increments are made considering the payload length and not the
> >     buffer length.
> >
> >     The value of 'buf_alloc' and 'fwd_cnt' are sent to the other peer in
> >     all packet headers or with an explicit CREDIT_UPDATE packet.
> >
> >     The local 'buf_alloc' value can be modified by the user space using
> >     setsockopt() with optname=SO_VM_SOCKETS_BUFFER_SIZE.
> >
> >     Before to send a packet, the peer checks the space available:
> >       credit_available = peer_buf_alloc - (tx_cnt - peer_fwd_cnt)
> >     and it will send up to credit_available bytes to the other peer.
> >
> > Possible solutions considering Michael's advice:
> > 1. Use the buffer length instead of the payload length when we increment
> >    the counters:
> >   - This approach will account precisely the memory used per socket.
> >   - This requires changes in both guest and host.
> >   - It is not compatible with old drivers, so a feature should be negotiated.
> > 2. Decrease the advertised 'buf_alloc' taking count of bytes queued in
> >    the socket queue but not used. (e.g. 256 byte used on 4K available in
> >    the buffer)
> >   - pkt->hdr.buf_alloc = buf_alloc - bytes_not_used.
> >   - This should be compatible also with old drivers.
> >
> > Maybe the second is less invasive, but will it be too tricky?
> > Any other advice or suggestions?
> >
> > Thanks in advance,
> > Stefano
>
> OK let me try to clarify.  The idea is this:
>
> Let's say we queue a buffer of 4K, and we copy if len < 128 bytes.  This
> means that in the worst case (128 byte packets), each byte of credit in
> the socket uses up 4K/128 = 16 bytes of kernel memory. In fact we need
> to also account for the virtio_vsock_pkt since I think it's kept around
> until userspace consumes it.
>
> Thus given X buf alloc allowed in the socket, we should publish X/16
> credits to the other side. This will ensure the other side does not send
> more than X/16 bytes for a given socket and thus we won't need to
> allocate more than X bytes to hold the data.
>
> We can play with the copy break value to tweak this.
>

Hi Michael,
sorry for the long silence, but I focused on multi-transport.

Before to implement your idea, I tried to do some calculations and
looking better to our credit mechanism:

  buf_alloc = 256 KB (default, tunable through setsockopt)
  sizeof(struct virtio_vsock_pkt) = 128

  - guest (we use preallocated 4 KB buffers to receive packets, copying
    small packet - < 128 -)
    worst_case = 129
    buf_size = 4 KB
    credit2mem = (buf_size + sizeof(struct virtio_vsock_pkt)) / worst_case = 32

    credit_published = buf_alloc / credit2mem = ~8 KB
    Space for just 2 full packet (4 KB)

  - host (we copy packets from the vring, allocating the space for the payload)
    worst_case = 1
    buf_size = 1
    credit2mem = (buf_size + sizeof(struct virtio_vsock_pkt)) / worst_case = 129

    credit_published = buf_alloc / credit2mem = ~2 KB
    Less than a full packet (guest now can send up to 64 KB with a single
    packet, so it will be limited to 2 KB)

Current memory consumption in the worst case if the RX queue is full:
  - guest
    mem = (buf_alloc / worst_case) *
          (buf_size + sizeof(struct virtio_vsock_pkt) = ~8MB

  - host
    mem = (buf_alloc / worst_case) *
          (buf_size + sizeof(struct virtio_vsock_pkt) = ~32MB

I think that the performance with big packets will be affected,
but I still have to try.

Another approach that I want to explore is to play with buf_alloc
published to the peer.

One thing that's not clear to me yet is the meaning of
SO_VM_SOCKETS_BUFFER_SIZE:
- max amount of memory used in the RX queue
- max amount of payload bytes in the RX queue (without overhead of
  struct virtio_vsock_pkt + preallocated buffer)

From the 'include/uapi/linux/vm_sockets.h':
    /* Option name for STREAM socket buffer size.  Use as the option name in
     * setsockopt(3) or getsockopt(3) to set or get an unsigned long long that
     * specifies the size of the buffer underlying a vSockets STREAM socket.
     * Value is clamped to the MIN and MAX.
     */

    #define SO_VM_SOCKETS_BUFFER_SIZE 0

Regardless, I think we need to limit memory consumption in some way.
I'll check the implementation of other transports, to understand better.

I'll keep you updated!

Thanks,
Stefano
Michael S. Tsirkin Oct. 11, 2019, 2:11 p.m. UTC | #28
On Fri, Oct 11, 2019 at 03:40:48PM +0200, Stefano Garzarella wrote:
> On Sun, Sep 1, 2019 at 8:56 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Fri, Aug 30, 2019 at 11:40:59AM +0200, Stefano Garzarella wrote:
> > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > and pushed to the guest using the vring, are directly queued in
> > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > with a fixed size (4 KB).
> > > > >
> > > > > The maximum amount of memory used by each socket should be
> > > > > controlled by the credit mechanism.
> > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > to avoid starvation of other sockets.
> > > > >
> > > > > This patch mitigates this issue copying the payload of small
> > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > order to avoid wasting memory.
> > > > >
> > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > >
> > > > This is good enough for net-next, but for net I think we
> > > > should figure out how to address the issue completely.
> > > > Can we make the accounting precise? What happens to
> > > > performance if we do?
> > > >
> > >
> > > Since I'm back from holidays, I'm restarting this thread to figure out
> > > how to address the issue completely.
> > >
> > > I did a better analysis of the credit mechanism that we implemented in
> > > virtio-vsock to get a clearer view and I'd share it with you:
> > >
> > >     This issue affect only the "host->guest" path. In this case, when the
> > >     host wants to send a packet to the guest, it uses a "free" buffer
> > >     allocated by the guest (4KB).
> > >     The "free" buffers available for the host are shared between all
> > >     sockets, instead, the credit mechanism is per-socket, I think to
> > >     avoid the starvation of others sockets.
> > >     The guests re-fill the "free" queue when the available buffers are
> > >     less than half.
> > >
> > >     Each peer have these variables in the per-socket state:
> > >        /* local vars */
> > >        buf_alloc        /* max bytes usable by this socket
> > >                            [exposed to the other peer] */
> > >        fwd_cnt          /* increased when RX packet is consumed by the
> > >                            user space [exposed to the other peer] */
> > >        tx_cnt                 /* increased when TX packet is sent to the other peer */
> > >
> > >        /* remote vars  */
> > >        peer_buf_alloc   /* peer's buf_alloc */
> > >        peer_fwd_cnt     /* peer's fwd_cnt */
> > >
> > >     When a peer sends a packet, it increases the 'tx_cnt'; when the
> > >     receiver consumes the packet (copy it to the user-space buffer), it
> > >     increases the 'fwd_cnt'.
> > >     Note: increments are made considering the payload length and not the
> > >     buffer length.
> > >
> > >     The value of 'buf_alloc' and 'fwd_cnt' are sent to the other peer in
> > >     all packet headers or with an explicit CREDIT_UPDATE packet.
> > >
> > >     The local 'buf_alloc' value can be modified by the user space using
> > >     setsockopt() with optname=SO_VM_SOCKETS_BUFFER_SIZE.
> > >
> > >     Before to send a packet, the peer checks the space available:
> > >       credit_available = peer_buf_alloc - (tx_cnt - peer_fwd_cnt)
> > >     and it will send up to credit_available bytes to the other peer.
> > >
> > > Possible solutions considering Michael's advice:
> > > 1. Use the buffer length instead of the payload length when we increment
> > >    the counters:
> > >   - This approach will account precisely the memory used per socket.
> > >   - This requires changes in both guest and host.
> > >   - It is not compatible with old drivers, so a feature should be negotiated.
> > > 2. Decrease the advertised 'buf_alloc' taking count of bytes queued in
> > >    the socket queue but not used. (e.g. 256 byte used on 4K available in
> > >    the buffer)
> > >   - pkt->hdr.buf_alloc = buf_alloc - bytes_not_used.
> > >   - This should be compatible also with old drivers.
> > >
> > > Maybe the second is less invasive, but will it be too tricky?
> > > Any other advice or suggestions?
> > >
> > > Thanks in advance,
> > > Stefano
> >
> > OK let me try to clarify.  The idea is this:
> >
> > Let's say we queue a buffer of 4K, and we copy if len < 128 bytes.  This
> > means that in the worst case (128 byte packets), each byte of credit in
> > the socket uses up 4K/128 = 16 bytes of kernel memory. In fact we need
> > to also account for the virtio_vsock_pkt since I think it's kept around
> > until userspace consumes it.
> >
> > Thus given X buf alloc allowed in the socket, we should publish X/16
> > credits to the other side. This will ensure the other side does not send
> > more than X/16 bytes for a given socket and thus we won't need to
> > allocate more than X bytes to hold the data.
> >
> > We can play with the copy break value to tweak this.
> >
> 
> Hi Michael,
> sorry for the long silence, but I focused on multi-transport.
> 
> Before to implement your idea, I tried to do some calculations and
> looking better to our credit mechanism:
> 
>   buf_alloc = 256 KB (default, tunable through setsockopt)
>   sizeof(struct virtio_vsock_pkt) = 128
> 
>   - guest (we use preallocated 4 KB buffers to receive packets, copying
>     small packet - < 128 -)
>     worst_case = 129
>     buf_size = 4 KB
>     credit2mem = (buf_size + sizeof(struct virtio_vsock_pkt)) / worst_case = 32

Making virtio_vsock_pkt smaller will help btw.
E.g. why keep a work struct per packet? do we need the header there? etc
etc.

> 
>     credit_published = buf_alloc / credit2mem = ~8 KB
>     Space for just 2 full packet (4 KB)
> 
>   - host (we copy packets from the vring, allocating the space for the payload)
>     worst_case = 1
>     buf_size = 1
>     credit2mem = (buf_size + sizeof(struct virtio_vsock_pkt)) / worst_case = 129
> 
>     credit_published = buf_alloc / credit2mem = ~2 KB
>     Less than a full packet (guest now can send up to 64 KB with a single
>     packet, so it will be limited to 2 KB)

1 byte in struct virtio_vsock_pkt is crazy. Can't we copy
like we do in the guest? E.g. allocate at least 128 bytes,
and then we can add data to the tail of the last packet?

> Current memory consumption in the worst case if the RX queue is full:
>   - guest
>     mem = (buf_alloc / worst_case) *
>           (buf_size + sizeof(struct virtio_vsock_pkt) = ~8MB
> 
>   - host
>     mem = (buf_alloc / worst_case) *
>           (buf_size + sizeof(struct virtio_vsock_pkt) = ~32MB
> 
> I think that the performance with big packets will be affected,
> but I still have to try.
> 
> Another approach that I want to explore is to play with buf_alloc
> published to the peer.
> 
> One thing that's not clear to me yet is the meaning of
> SO_VM_SOCKETS_BUFFER_SIZE:
> - max amount of memory used in the RX queue
> - max amount of payload bytes in the RX queue (without overhead of
>   struct virtio_vsock_pkt + preallocated buffer)
> 
> >From the 'include/uapi/linux/vm_sockets.h':
>     /* Option name for STREAM socket buffer size.  Use as the option name in
>      * setsockopt(3) or getsockopt(3) to set or get an unsigned long long that
>      * specifies the size of the buffer underlying a vSockets STREAM socket.
>      * Value is clamped to the MIN and MAX.
>      */
> 
>     #define SO_VM_SOCKETS_BUFFER_SIZE 0
> 
> Regardless, I think we need to limit memory consumption in some way.
> I'll check the implementation of other transports, to understand better.
> 
> I'll keep you updated!
> 
> Thanks,
> Stefano
Stefano Garzarella Oct. 11, 2019, 2:23 p.m. UTC | #29
On Fri, Oct 11, 2019 at 10:11:19AM -0400, Michael S. Tsirkin wrote:
> On Fri, Oct 11, 2019 at 03:40:48PM +0200, Stefano Garzarella wrote:
> > On Sun, Sep 1, 2019 at 8:56 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > On Fri, Aug 30, 2019 at 11:40:59AM +0200, Stefano Garzarella wrote:
> > > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > > and pushed to the guest using the vring, are directly queued in
> > > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > > with a fixed size (4 KB).
> > > > > >
> > > > > > The maximum amount of memory used by each socket should be
> > > > > > controlled by the credit mechanism.
> > > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > > to avoid starvation of other sockets.
> > > > > >
> > > > > > This patch mitigates this issue copying the payload of small
> > > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > > order to avoid wasting memory.
> > > > > >
> > > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > >
> > > > > This is good enough for net-next, but for net I think we
> > > > > should figure out how to address the issue completely.
> > > > > Can we make the accounting precise? What happens to
> > > > > performance if we do?
> > > > >
> > > >
> > > > Since I'm back from holidays, I'm restarting this thread to figure out
> > > > how to address the issue completely.
> > > >
> > > > I did a better analysis of the credit mechanism that we implemented in
> > > > virtio-vsock to get a clearer view and I'd share it with you:
> > > >
> > > >     This issue affect only the "host->guest" path. In this case, when the
> > > >     host wants to send a packet to the guest, it uses a "free" buffer
> > > >     allocated by the guest (4KB).
> > > >     The "free" buffers available for the host are shared between all
> > > >     sockets, instead, the credit mechanism is per-socket, I think to
> > > >     avoid the starvation of others sockets.
> > > >     The guests re-fill the "free" queue when the available buffers are
> > > >     less than half.
> > > >
> > > >     Each peer have these variables in the per-socket state:
> > > >        /* local vars */
> > > >        buf_alloc        /* max bytes usable by this socket
> > > >                            [exposed to the other peer] */
> > > >        fwd_cnt          /* increased when RX packet is consumed by the
> > > >                            user space [exposed to the other peer] */
> > > >        tx_cnt                 /* increased when TX packet is sent to the other peer */
> > > >
> > > >        /* remote vars  */
> > > >        peer_buf_alloc   /* peer's buf_alloc */
> > > >        peer_fwd_cnt     /* peer's fwd_cnt */
> > > >
> > > >     When a peer sends a packet, it increases the 'tx_cnt'; when the
> > > >     receiver consumes the packet (copy it to the user-space buffer), it
> > > >     increases the 'fwd_cnt'.
> > > >     Note: increments are made considering the payload length and not the
> > > >     buffer length.
> > > >
> > > >     The value of 'buf_alloc' and 'fwd_cnt' are sent to the other peer in
> > > >     all packet headers or with an explicit CREDIT_UPDATE packet.
> > > >
> > > >     The local 'buf_alloc' value can be modified by the user space using
> > > >     setsockopt() with optname=SO_VM_SOCKETS_BUFFER_SIZE.
> > > >
> > > >     Before to send a packet, the peer checks the space available:
> > > >       credit_available = peer_buf_alloc - (tx_cnt - peer_fwd_cnt)
> > > >     and it will send up to credit_available bytes to the other peer.
> > > >
> > > > Possible solutions considering Michael's advice:
> > > > 1. Use the buffer length instead of the payload length when we increment
> > > >    the counters:
> > > >   - This approach will account precisely the memory used per socket.
> > > >   - This requires changes in both guest and host.
> > > >   - It is not compatible with old drivers, so a feature should be negotiated.
> > > > 2. Decrease the advertised 'buf_alloc' taking count of bytes queued in
> > > >    the socket queue but not used. (e.g. 256 byte used on 4K available in
> > > >    the buffer)
> > > >   - pkt->hdr.buf_alloc = buf_alloc - bytes_not_used.
> > > >   - This should be compatible also with old drivers.
> > > >
> > > > Maybe the second is less invasive, but will it be too tricky?
> > > > Any other advice or suggestions?
> > > >
> > > > Thanks in advance,
> > > > Stefano
> > >
> > > OK let me try to clarify.  The idea is this:
> > >
> > > Let's say we queue a buffer of 4K, and we copy if len < 128 bytes.  This
> > > means that in the worst case (128 byte packets), each byte of credit in
> > > the socket uses up 4K/128 = 16 bytes of kernel memory. In fact we need
> > > to also account for the virtio_vsock_pkt since I think it's kept around
> > > until userspace consumes it.
> > >
> > > Thus given X buf alloc allowed in the socket, we should publish X/16
> > > credits to the other side. This will ensure the other side does not send
> > > more than X/16 bytes for a given socket and thus we won't need to
> > > allocate more than X bytes to hold the data.
> > >
> > > We can play with the copy break value to tweak this.
> > >
> > 
> > Hi Michael,
> > sorry for the long silence, but I focused on multi-transport.
> > 
> > Before to implement your idea, I tried to do some calculations and
> > looking better to our credit mechanism:
> > 
> >   buf_alloc = 256 KB (default, tunable through setsockopt)
> >   sizeof(struct virtio_vsock_pkt) = 128
> > 
> >   - guest (we use preallocated 4 KB buffers to receive packets, copying
> >     small packet - < 128 -)
> >     worst_case = 129
> >     buf_size = 4 KB
> >     credit2mem = (buf_size + sizeof(struct virtio_vsock_pkt)) / worst_case = 32
> 
> Making virtio_vsock_pkt smaller will help btw.
> E.g. why keep a work struct per packet? do we need the header there? etc
> etc.
> 

Yes, I had the same feeling, I'll try to make it smaller!


> > 
> >     credit_published = buf_alloc / credit2mem = ~8 KB
> >     Space for just 2 full packet (4 KB)
> > 
> >   - host (we copy packets from the vring, allocating the space for the payload)
> >     worst_case = 1
> >     buf_size = 1
> >     credit2mem = (buf_size + sizeof(struct virtio_vsock_pkt)) / worst_case = 129
> > 
> >     credit_published = buf_alloc / credit2mem = ~2 KB
> >     Less than a full packet (guest now can send up to 64 KB with a single
> >     packet, so it will be limited to 2 KB)
> 
> 1 byte in struct virtio_vsock_pkt is crazy. Can't we copy

I know :-)

> like we do in the guest? E.g. allocate at least 128 bytes,
> and then we can add data to the tail of the last packet?

Yes, I've been thinking about trying that out. (allocate at least 128 bytes
and do as we do in the guest).


Thanks,
Stefano
Stefan Hajnoczi Oct. 14, 2019, 8:17 a.m. UTC | #30
On Fri, Oct 11, 2019 at 03:40:48PM +0200, Stefano Garzarella wrote:
> On Sun, Sep 1, 2019 at 8:56 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Fri, Aug 30, 2019 at 11:40:59AM +0200, Stefano Garzarella wrote:
> > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > and pushed to the guest using the vring, are directly queued in
> > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > with a fixed size (4 KB).
> > > > >
> > > > > The maximum amount of memory used by each socket should be
> > > > > controlled by the credit mechanism.
> > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > to avoid starvation of other sockets.
> > > > >
> > > > > This patch mitigates this issue copying the payload of small
> > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > order to avoid wasting memory.
> > > > >
> > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > >
> > > > This is good enough for net-next, but for net I think we
> > > > should figure out how to address the issue completely.
> > > > Can we make the accounting precise? What happens to
> > > > performance if we do?
> > > >
> > >
> > > Since I'm back from holidays, I'm restarting this thread to figure out
> > > how to address the issue completely.
> > >
> > > I did a better analysis of the credit mechanism that we implemented in
> > > virtio-vsock to get a clearer view and I'd share it with you:
> > >
> > >     This issue affect only the "host->guest" path. In this case, when the
> > >     host wants to send a packet to the guest, it uses a "free" buffer
> > >     allocated by the guest (4KB).
> > >     The "free" buffers available for the host are shared between all
> > >     sockets, instead, the credit mechanism is per-socket, I think to
> > >     avoid the starvation of others sockets.
> > >     The guests re-fill the "free" queue when the available buffers are
> > >     less than half.
> > >
> > >     Each peer have these variables in the per-socket state:
> > >        /* local vars */
> > >        buf_alloc        /* max bytes usable by this socket
> > >                            [exposed to the other peer] */
> > >        fwd_cnt          /* increased when RX packet is consumed by the
> > >                            user space [exposed to the other peer] */
> > >        tx_cnt                 /* increased when TX packet is sent to the other peer */
> > >
> > >        /* remote vars  */
> > >        peer_buf_alloc   /* peer's buf_alloc */
> > >        peer_fwd_cnt     /* peer's fwd_cnt */
> > >
> > >     When a peer sends a packet, it increases the 'tx_cnt'; when the
> > >     receiver consumes the packet (copy it to the user-space buffer), it
> > >     increases the 'fwd_cnt'.
> > >     Note: increments are made considering the payload length and not the
> > >     buffer length.
> > >
> > >     The value of 'buf_alloc' and 'fwd_cnt' are sent to the other peer in
> > >     all packet headers or with an explicit CREDIT_UPDATE packet.
> > >
> > >     The local 'buf_alloc' value can be modified by the user space using
> > >     setsockopt() with optname=SO_VM_SOCKETS_BUFFER_SIZE.
> > >
> > >     Before to send a packet, the peer checks the space available:
> > >       credit_available = peer_buf_alloc - (tx_cnt - peer_fwd_cnt)
> > >     and it will send up to credit_available bytes to the other peer.
> > >
> > > Possible solutions considering Michael's advice:
> > > 1. Use the buffer length instead of the payload length when we increment
> > >    the counters:
> > >   - This approach will account precisely the memory used per socket.
> > >   - This requires changes in both guest and host.
> > >   - It is not compatible with old drivers, so a feature should be negotiated.
> > > 2. Decrease the advertised 'buf_alloc' taking count of bytes queued in
> > >    the socket queue but not used. (e.g. 256 byte used on 4K available in
> > >    the buffer)
> > >   - pkt->hdr.buf_alloc = buf_alloc - bytes_not_used.
> > >   - This should be compatible also with old drivers.
> > >
> > > Maybe the second is less invasive, but will it be too tricky?
> > > Any other advice or suggestions?
> > >
> > > Thanks in advance,
> > > Stefano
> >
> > OK let me try to clarify.  The idea is this:
> >
> > Let's say we queue a buffer of 4K, and we copy if len < 128 bytes.  This
> > means that in the worst case (128 byte packets), each byte of credit in
> > the socket uses up 4K/128 = 16 bytes of kernel memory. In fact we need
> > to also account for the virtio_vsock_pkt since I think it's kept around
> > until userspace consumes it.
> >
> > Thus given X buf alloc allowed in the socket, we should publish X/16
> > credits to the other side. This will ensure the other side does not send
> > more than X/16 bytes for a given socket and thus we won't need to
> > allocate more than X bytes to hold the data.
> >
> > We can play with the copy break value to tweak this.
> >
> 
> Hi Michael,
> sorry for the long silence, but I focused on multi-transport.
> 
> Before to implement your idea, I tried to do some calculations and
> looking better to our credit mechanism:
> 
>   buf_alloc = 256 KB (default, tunable through setsockopt)
>   sizeof(struct virtio_vsock_pkt) = 128
> 
>   - guest (we use preallocated 4 KB buffers to receive packets, copying
>     small packet - < 128 -)
>     worst_case = 129
>     buf_size = 4 KB
>     credit2mem = (buf_size + sizeof(struct virtio_vsock_pkt)) / worst_case = 32
> 
>     credit_published = buf_alloc / credit2mem = ~8 KB
>     Space for just 2 full packet (4 KB)
> 
>   - host (we copy packets from the vring, allocating the space for the payload)
>     worst_case = 1
>     buf_size = 1
>     credit2mem = (buf_size + sizeof(struct virtio_vsock_pkt)) / worst_case = 129
> 
>     credit_published = buf_alloc / credit2mem = ~2 KB
>     Less than a full packet (guest now can send up to 64 KB with a single
>     packet, so it will be limited to 2 KB)
> 
> Current memory consumption in the worst case if the RX queue is full:
>   - guest
>     mem = (buf_alloc / worst_case) *
>           (buf_size + sizeof(struct virtio_vsock_pkt) = ~8MB
> 
>   - host
>     mem = (buf_alloc / worst_case) *
>           (buf_size + sizeof(struct virtio_vsock_pkt) = ~32MB
> 
> I think that the performance with big packets will be affected,
> but I still have to try.
> 
> Another approach that I want to explore is to play with buf_alloc
> published to the peer.
> 
> One thing that's not clear to me yet is the meaning of
> SO_VM_SOCKETS_BUFFER_SIZE:
> - max amount of memory used in the RX queue
> - max amount of payload bytes in the RX queue (without overhead of
>   struct virtio_vsock_pkt + preallocated buffer)
> 
> From the 'include/uapi/linux/vm_sockets.h':
>     /* Option name for STREAM socket buffer size.  Use as the option name in
>      * setsockopt(3) or getsockopt(3) to set or get an unsigned long long that
>      * specifies the size of the buffer underlying a vSockets STREAM socket.
>      * Value is clamped to the MIN and MAX.
>      */
> 
>     #define SO_VM_SOCKETS_BUFFER_SIZE 0
> 
> Regardless, I think we need to limit memory consumption in some way.
> I'll check the implementation of other transports, to understand better.

SO_VM_SOCKETS_BUFFER_SIZE might have been useful for VMCI-specific
applications, but we should use SO_RCVBUF and SO_SNDBUF for portable
applications in the future.  Those socket options also work with other
address families.

I guess these sockopts are bypassed by AF_VSOCK because it doesn't use
the common skb queuing code in net/core/sock.c :(.  But one day we might
migrate to it...

Stefan
Jason Wang Oct. 14, 2019, 8:21 a.m. UTC | #31
On 2019/10/14 下午4:17, Stefan Hajnoczi wrote:
> SO_VM_SOCKETS_BUFFER_SIZE might have been useful for VMCI-specific
> applications, but we should use SO_RCVBUF and SO_SNDBUF for portable
> applications in the future.  Those socket options also work with other
> address families.
>
> I guess these sockopts are bypassed by AF_VSOCK because it doesn't use
> the common skb queuing code in net/core/sock.c:(.  But one day we might
> migrate to it...
>
> Stefan


+1, we should really consider to reuse the exist socket mechanism 
instead of re-inventing wheels.

Thanks
Stefano Garzarella Oct. 14, 2019, 8:38 a.m. UTC | #32
On Mon, Oct 14, 2019 at 04:21:35PM +0800, Jason Wang wrote:
> On 2019/10/14 下午4:17, Stefan Hajnoczi wrote:
> > SO_VM_SOCKETS_BUFFER_SIZE might have been useful for VMCI-specific
> > applications, but we should use SO_RCVBUF and SO_SNDBUF for portable
> > applications in the future.  Those socket options also work with other
> > address families.
> > 

I think hyperv_transport started to use it in this patch:
ac383f58f3c9  hv_sock: perf: Allow the socket buffer size options to influence
              the actual socket buffers


> > I guess these sockopts are bypassed by AF_VSOCK because it doesn't use
> > the common skb queuing code in net/core/sock.c:(.  But one day we might
> > migrate to it...
> > 
> > Stefan
> 
> 
> +1, we should really consider to reuse the exist socket mechanism instead of
> re-inventing wheels.

+1, I totally agree. I'll go this way.

Guys, thank you all for your suggestions!

Patch
diff mbox series

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 6a50e1d0529c..6c8390a2af52 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -329,6 +329,8 @@  vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
 		return NULL;
 	}
 
+	pkt->buf_len = pkt->len;
+
 	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
 	if (nbytes != pkt->len) {
 		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index e223e2632edd..7d973903f52e 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -52,6 +52,7 @@  struct virtio_vsock_pkt {
 	/* socket refcnt not held, only use for cancellation */
 	struct vsock_sock *vsk;
 	void *buf;
+	u32 buf_len;
 	u32 len;
 	u32 off;
 	bool reply;
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 0815d1357861..082a30936690 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -307,6 +307,7 @@  static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
 			break;
 		}
 
+		pkt->buf_len = buf_len;
 		pkt->len = buf_len;
 
 		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 6f1a8aff65c5..095221f94786 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -26,6 +26,9 @@ 
 /* How long to wait for graceful shutdown of a connection */
 #define VSOCK_CLOSE_TIMEOUT (8 * HZ)
 
+/* Threshold for detecting small packets to copy */
+#define GOOD_COPY_LEN  128
+
 static const struct virtio_transport *virtio_transport_get_ops(void)
 {
 	const struct vsock_transport *t = vsock_core_get_transport();
@@ -64,6 +67,9 @@  virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
 		pkt->buf = kmalloc(len, GFP_KERNEL);
 		if (!pkt->buf)
 			goto out_pkt;
+
+		pkt->buf_len = len;
+
 		err = memcpy_from_msg(pkt->buf, info->msg, len);
 		if (err)
 			goto out;
@@ -841,24 +847,60 @@  virtio_transport_recv_connecting(struct sock *sk,
 	return err;
 }
 
+static void
+virtio_transport_recv_enqueue(struct vsock_sock *vsk,
+			      struct virtio_vsock_pkt *pkt)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	bool free_pkt = false;
+
+	pkt->len = le32_to_cpu(pkt->hdr.len);
+	pkt->off = 0;
+
+	spin_lock_bh(&vvs->rx_lock);
+
+	virtio_transport_inc_rx_pkt(vvs, pkt);
+
+	/* Try to copy small packets into the buffer of last packet queued,
+	 * to avoid wasting memory queueing the entire buffer with a small
+	 * payload.
+	 */
+	if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) {
+		struct virtio_vsock_pkt *last_pkt;
+
+		last_pkt = list_last_entry(&vvs->rx_queue,
+					   struct virtio_vsock_pkt, list);
+
+		/* If there is space in the last packet queued, we copy the
+		 * new packet in its buffer.
+		 */
+		if (pkt->len <= last_pkt->buf_len - last_pkt->len) {
+			memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
+			       pkt->len);
+			last_pkt->len += pkt->len;
+			free_pkt = true;
+			goto out;
+		}
+	}
+
+	list_add_tail(&pkt->list, &vvs->rx_queue);
+
+out:
+	spin_unlock_bh(&vvs->rx_lock);
+	if (free_pkt)
+		virtio_transport_free_pkt(pkt);
+}
+
 static int
 virtio_transport_recv_connected(struct sock *sk,
 				struct virtio_vsock_pkt *pkt)
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
-	struct virtio_vsock_sock *vvs = vsk->trans;
 	int err = 0;
 
 	switch (le16_to_cpu(pkt->hdr.op)) {
 	case VIRTIO_VSOCK_OP_RW:
-		pkt->len = le32_to_cpu(pkt->hdr.len);
-		pkt->off = 0;
-
-		spin_lock_bh(&vvs->rx_lock);
-		virtio_transport_inc_rx_pkt(vvs, pkt);
-		list_add_tail(&pkt->list, &vvs->rx_queue);
-		spin_unlock_bh(&vvs->rx_lock);
-
+		virtio_transport_recv_enqueue(vsk, pkt);
 		sk->sk_data_ready(sk);
 		return err;
 	case VIRTIO_VSOCK_OP_CREDIT_UPDATE: