diff mbox

kvm tools: virtio-net mergable rx buffers

Message ID 1366677147-2150-1-git-send-email-sasha.levin@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Sasha Levin April 23, 2013, 12:32 a.m. UTC
Support mergable rx buffers for virtio-net. This helps reduce the amount
of memory the guest kernel has to allocate per rx vq.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
---
 tools/kvm/include/kvm/uip.h  |  4 ++--
 tools/kvm/include/kvm/util.h |  3 +++
 tools/kvm/net/uip/core.c     | 54 +++++---------------------------------------
 tools/kvm/net/uip/tcp.c      |  2 +-
 tools/kvm/net/uip/udp.c      |  2 +-
 tools/kvm/util/util.c        | 15 ++++++++++++
 tools/kvm/virtio/net.c       | 37 ++++++++++++++++++++++--------
 7 files changed, 55 insertions(+), 62 deletions(-)

Comments

Pekka Enberg April 23, 2013, 9:06 a.m. UTC | #1
On Tue, Apr 23, 2013 at 3:32 AM, Sasha Levin <sasha.levin@oracle.com> wrote:
> @@ -94,4 +95,6 @@ struct kvm;
>  void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
>  void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
>
> +int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len);
> +

I hate the name. Maybe append_to_iovec or something?

Otherwise looks OK to me. Asias?

                        Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Asias He April 23, 2013, 2:19 p.m. UTC | #2
On Tue, Apr 23, 2013 at 12:06:56PM +0300, Pekka Enberg wrote:
> On Tue, Apr 23, 2013 at 3:32 AM, Sasha Levin <sasha.levin@oracle.com> wrote:
> > @@ -94,4 +95,6 @@ struct kvm;
> >  void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
> >  void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
> >
> > +int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len);
> > +
> 
> I hate the name. Maybe append_to_iovec or something?
> 
> Otherwise looks OK to me. Asias?

I will look into it tomorrow.

>                         Pekka
Sasha Levin April 23, 2013, 3:01 p.m. UTC | #3
(off list)

On 04/22/2013 08:32 PM, Sasha Levin wrote:
> Support mergable rx buffers for virtio-net. This helps reduce the amount
> of memory the guest kernel has to allocate per rx vq.

One of the benefits of this is that even one virtio-net device with just
one pair of vqs will require much less memory - so you could try booting
smaller guests than before!


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Northup April 23, 2013, 4:35 p.m. UTC | #4
Do you care about guests with drivers that don't negotiate
VIRTIO_NET_F_MRG_RXBUF?

On Mon, Apr 22, 2013 at 5:32 PM, Sasha Levin <sasha.levin@oracle.com> wrote:
>
> Support mergable rx buffers for virtio-net. This helps reduce the amount
> of memory the guest kernel has to allocate per rx vq.
>
> Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
> ---
>  tools/kvm/include/kvm/uip.h  |  4 ++--
>  tools/kvm/include/kvm/util.h |  3 +++
>  tools/kvm/net/uip/core.c     | 54 +++++---------------------------------------
>  tools/kvm/net/uip/tcp.c      |  2 +-
>  tools/kvm/net/uip/udp.c      |  2 +-
>  tools/kvm/util/util.c        | 15 ++++++++++++
>  tools/kvm/virtio/net.c       | 37 ++++++++++++++++++++++--------
>  7 files changed, 55 insertions(+), 62 deletions(-)
>
> diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h
> index ac248d2..fa82f10 100644
> --- a/tools/kvm/include/kvm/uip.h
> +++ b/tools/kvm/include/kvm/uip.h
> @@ -252,7 +252,7 @@ struct uip_tcp_socket {
>  };
>
>  struct uip_tx_arg {
> -       struct virtio_net_hdr *vnet;
> +       struct virtio_net_hdr_mrg_rxbuf *vnet;
>         struct uip_info *info;
>         struct uip_eth *eth;
>         int vnet_len;
> @@ -332,7 +332,7 @@ static inline u16 uip_eth_hdrlen(struct uip_eth *eth)
>  }
>
>  int uip_tx(struct iovec *iov, u16 out, struct uip_info *info);
> -int uip_rx(struct iovec *iov, u16 in, struct uip_info *info);
> +int uip_rx(unsigned char *buffer, u32 length, struct uip_info *info);
>  int uip_init(struct uip_info *info);
>
>  int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg);
> diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h
> index 0df9f0d..6f8ac83 100644
> --- a/tools/kvm/include/kvm/util.h
> +++ b/tools/kvm/include/kvm/util.h
> @@ -22,6 +22,7 @@
>  #include <sys/param.h>
>  #include <sys/types.h>
>  #include <linux/types.h>
> +#include <sys/uio.h>
>
>  #ifdef __GNUC__
>  #define NORETURN __attribute__((__noreturn__))
> @@ -94,4 +95,6 @@ struct kvm;
>  void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
>  void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
>
> +int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len);
> +
>  #endif /* KVM__UTIL_H */
> diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c
> index 4e5bb82..d9e9993 100644
> --- a/tools/kvm/net/uip/core.c
> +++ b/tools/kvm/net/uip/core.c
> @@ -7,7 +7,7 @@
>
>  int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
>  {
> -       struct virtio_net_hdr *vnet;
> +       struct virtio_net_hdr_mrg_rxbuf *vnet;
>         struct uip_tx_arg arg;
>         int eth_len, vnet_len;
>         struct uip_eth *eth;
> @@ -74,63 +74,21 @@ int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
>         return vnet_len + eth_len;
>  }
>
> -int uip_rx(struct iovec *iov, u16 in, struct uip_info *info)
> +int uip_rx(unsigned char *buffer, u32 length, struct uip_info *info)
>  {
> -       struct virtio_net_hdr *vnet;
> -       struct uip_eth *eth;
>         struct uip_buf *buf;
> -       int vnet_len;
> -       int eth_len;
> -       char *p;
>         int len;
> -       int cnt;
> -       int i;
>
>         /*
>          * Sleep until there is a buffer for guest
>          */
>         buf = uip_buf_get_used(info);
>
> -       /*
> -        * Fill device to guest buffer, vnet hdr fisrt
> -        */
> -       vnet_len = iov[0].iov_len;
> -       vnet = iov[0].iov_base;
> -       if (buf->vnet_len > vnet_len) {
> -               len = -1;
> -               goto out;
> -       }
> -       memcpy(vnet, buf->vnet, buf->vnet_len);
> -
> -       /*
> -        * Then, the real eth data
> -        * Note: Be sure buf->eth_len is not bigger than the buffer len that guest provides
> -        */
> -       cnt = buf->eth_len;
> -       p = buf->eth;
> -       for (i = 1; i < in; i++) {
> -               eth_len = iov[i].iov_len;
> -               eth = iov[i].iov_base;
> -               if (cnt > eth_len) {
> -                       memcpy(eth, p, eth_len);
> -                       cnt -= eth_len;
> -                       p += eth_len;
> -               } else {
> -                       memcpy(eth, p, cnt);
> -                       cnt -= cnt;
> -                       break;
> -               }
> -       }
> -
> -       if (cnt) {
> -               pr_warning("uip_rx error");
> -               len = -1;
> -               goto out;
> -       }
> +       memcpy(buffer, buf->vnet, buf->vnet_len);
> +       memcpy(buffer + buf->vnet_len, buf->eth, buf->eth_len);
>
>         len = buf->vnet_len + buf->eth_len;
>
> -out:
>         uip_buf_set_free(info, buf);
>         return len;
>  }
> @@ -172,8 +130,8 @@ int uip_init(struct uip_info *info)
>         }
>
>         list_for_each_entry(buf, buf_head, list) {
> -               buf->vnet       = malloc(sizeof(struct virtio_net_hdr));
> -               buf->vnet_len   = sizeof(struct virtio_net_hdr);
> +               buf->vnet       = malloc(sizeof(struct virtio_net_hdr_mrg_rxbuf));
> +               buf->vnet_len   = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>                 buf->eth        = malloc(1024*64 + sizeof(struct uip_pseudo_hdr));
>                 buf->eth_len    = 1024*64 + sizeof(struct uip_pseudo_hdr);
>
> diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c
> index 9044f40..a99b95e 100644
> --- a/tools/kvm/net/uip/tcp.c
> +++ b/tools/kvm/net/uip/tcp.c
> @@ -153,7 +153,7 @@ static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_
>         /*
>          * virtio_net_hdr
>          */
> -       buf->vnet_len   = sizeof(struct virtio_net_hdr);
> +       buf->vnet_len   = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>         memset(buf->vnet, 0, buf->vnet_len);
>
>         buf->eth_len    = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
> diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c
> index 31c417c..083c221 100644
> --- a/tools/kvm/net/uip/udp.c
> +++ b/tools/kvm/net/uip/udp.c
> @@ -142,7 +142,7 @@ int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct ui
>         /*
>          * virtio_net_hdr
>          */
> -       buf->vnet_len   = sizeof(struct virtio_net_hdr);
> +       buf->vnet_len   = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>         memset(buf->vnet, 0, buf->vnet_len);
>
>         buf->eth_len    = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
> diff --git a/tools/kvm/util/util.c b/tools/kvm/util/util.c
> index c11a15a..4e0069c 100644
> --- a/tools/kvm/util/util.c
> +++ b/tools/kvm/util/util.c
> @@ -9,6 +9,7 @@
>  #include <sys/mman.h>
>  #include <sys/stat.h>
>  #include <sys/statfs.h>
> +#include <linux/kernel.h>
>
>  static void report(const char *prefix, const char *err, va_list params)
>  {
> @@ -131,3 +132,17 @@ void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 si
>                 return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
>         }
>  }
> +
> +int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len)
> +{
> +       size_t copy, start_len = len;
> +
> +       for (; len > 0 && iovlen > 0; ++iov, --iovlen) {
> +               copy = min(iov->iov_len, len);
> +               memcpy(iov->iov_base, kdata, copy);
> +               kdata += copy;
> +               len -= copy;
> +       }
> +
> +       return start_len - len;
> +}
> diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
> index c0a8f12..c99c5a6 100644
> --- a/tools/kvm/virtio/net.c
> +++ b/tools/kvm/virtio/net.c
> @@ -32,8 +32,8 @@
>  struct net_dev;
>
>  struct net_dev_operations {
> -       int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
> -       int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
> +       int (*rx)(unsigned char *buffer, u32 length, struct net_dev *ndev);
> +       int (*tx)(struct iovec *iov, u16 out, struct net_dev *ndev);
>  };
>
>  struct net_dev {
> @@ -63,6 +63,8 @@ struct net_dev {
>  static LIST_HEAD(ndevs);
>  static int compat_id = -1;
>
> +#define MAX_PACKET_SIZE ((u16)(-1))
> +
>  static void *virtio_net_rx_thread(void *p)
>  {
>         struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
> @@ -90,9 +92,23 @@ static void *virtio_net_rx_thread(void *p)
>                 mutex_unlock(&ndev->io_lock[id]);
>
>                 while (virt_queue__available(vq)) {
> +                       unsigned char buffer[MAX_PACKET_SIZE];
> +                       struct virtio_net_hdr_mrg_rxbuf *hdr;
> +
> +                       len = ndev->ops->rx(buffer, MAX_PACKET_SIZE, ndev);
>                         head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
> -                       len = ndev->ops->rx(iov, in, ndev);
> -                       virt_queue__set_used_elem(vq, head, len);
> +                       hdr = (void *)iov[0].iov_base;
> +                       while (len) {
> +                               int copied;
> +
> +                               copied = memcpy_toiovecend(iov, in, buffer, len);
> +                               len -= copied;
> +                               hdr->num_buffers++;
> +                               virt_queue__set_used_elem(vq, head, copied);
> +                               if (len == 0)
> +                                       break;
> +                               head = virt_queue__get_iov(vq, iov, &out, &in, kvm);

Need to check that virt_queue__available(vq) first?

> +                       }
>
>                         /* We should interrupt guest right now, otherwise latency is huge. */
>                         if (virtio_queue__should_signal(vq))
> @@ -241,7 +257,7 @@ static bool virtio_net__tap_init(const struct virtio_net_params *params,
>                 goto fail;
>         }
>
> -       hdr_len = sizeof(struct virtio_net_hdr);
> +       hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>         if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
>                 pr_warning("Config tap device TUNSETVNETHDRSZ error");
>
> @@ -300,9 +316,9 @@ static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
>         return writev(ndev->tap_fd, iov, out);
>  }
>
> -static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
> +static inline int tap_ops_rx(unsigned char *buffer, u32 length, struct net_dev *ndev)
>  {
> -       return readv(ndev->tap_fd, iov, in);
> +       return read(ndev->tap_fd, buffer, length);
>  }
>
>  static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
> @@ -310,9 +326,9 @@ static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
>         return uip_tx(iov, out, &ndev->info);
>  }
>
> -static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
> +static inline int uip_ops_rx(unsigned char *buffer, u32 length, struct net_dev *ndev)
>  {
> -       return uip_rx(iov, in, &ndev->info);
> +       return uip_rx(buffer, length, &ndev->info);
>  }
>
>  static struct net_dev_operations tap_ops = {
> @@ -347,7 +363,8 @@ static u32 get_host_features(struct kvm *kvm, void *dev)
>                 | 1UL << VIRTIO_RING_F_EVENT_IDX
>                 | 1UL << VIRTIO_RING_F_INDIRECT_DESC
>                 | 1UL << VIRTIO_NET_F_CTRL_VQ
> -               | 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
> +               | 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0)
> +               | 1UL << VIRTIO_NET_F_MRG_RXBUF;
>  }
>
>  static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
> --
> 1.8.2.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sasha Levin April 24, 2013, 2:51 a.m. UTC | #5
On 04/23/2013 12:35 PM, Eric Northup wrote:
> Do you care about guests with drivers that don't negotiate
> VIRTIO_NET_F_MRG_RXBUF?

We usually try to keep backward compatibility, but in this case
mergable RX buffers are about 5 years old now, so it's safe to
assume they'll be running in any guest.

Unless there is a specific reason to allow working without them
I'd rather keep the code simple in this case.

> On Mon, Apr 22, 2013 at 5:32 PM, Sasha Levin <sasha.levin@oracle.com> wrote:
>> +                               copied = memcpy_toiovecend(iov, in, buffer, len);
>> +                               len -= copied;
>> +                               hdr->num_buffers++;
>> +                               virt_queue__set_used_elem(vq, head, copied);
>> +                               if (len == 0)
>> +                                       break;
>> +                               head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
> 
> Need to check that virt_queue__available(vq) first?

Yup. I wonder why it didn't blow up running 'ping -f' with a huge packet size.


Thanks,
Sasha

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Asias He April 24, 2013, 5:32 a.m. UTC | #6
On Mon, Apr 22, 2013 at 08:32:27PM -0400, Sasha Levin wrote:
> Support mergable rx buffers for virtio-net. This helps reduce the amount
> of memory the guest kernel has to allocate per rx vq.

With this, we always do an extra copy of the rx data into a linear buffer.
I am thinking about whether we should keep the non MRG_RXBUF code. For
tap use case, it is fine, user can use vhost. But for uip use case, we
can not use vhost.

> Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
> ---
>  tools/kvm/include/kvm/uip.h  |  4 ++--
>  tools/kvm/include/kvm/util.h |  3 +++
>  tools/kvm/net/uip/core.c     | 54 +++++---------------------------------------
>  tools/kvm/net/uip/tcp.c      |  2 +-
>  tools/kvm/net/uip/udp.c      |  2 +-
>  tools/kvm/util/util.c        | 15 ++++++++++++
>  tools/kvm/virtio/net.c       | 37 ++++++++++++++++++++++--------
>  7 files changed, 55 insertions(+), 62 deletions(-)
> 
> diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h
> index ac248d2..fa82f10 100644
> --- a/tools/kvm/include/kvm/uip.h
> +++ b/tools/kvm/include/kvm/uip.h
> @@ -252,7 +252,7 @@ struct uip_tcp_socket {
>  };
>  
>  struct uip_tx_arg {
> -	struct virtio_net_hdr *vnet;
> +	struct virtio_net_hdr_mrg_rxbuf *vnet;
>  	struct uip_info *info;
>  	struct uip_eth *eth;
>  	int vnet_len;
> @@ -332,7 +332,7 @@ static inline u16 uip_eth_hdrlen(struct uip_eth *eth)
>  }
>  
>  int uip_tx(struct iovec *iov, u16 out, struct uip_info *info);
> -int uip_rx(struct iovec *iov, u16 in, struct uip_info *info);
> +int uip_rx(unsigned char *buffer, u32 length, struct uip_info *info);
>  int uip_init(struct uip_info *info);
>  
>  int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg);
> diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h
> index 0df9f0d..6f8ac83 100644
> --- a/tools/kvm/include/kvm/util.h
> +++ b/tools/kvm/include/kvm/util.h
> @@ -22,6 +22,7 @@
>  #include <sys/param.h>
>  #include <sys/types.h>
>  #include <linux/types.h>
> +#include <sys/uio.h>
>  
>  #ifdef __GNUC__
>  #define NORETURN __attribute__((__noreturn__))
> @@ -94,4 +95,6 @@ struct kvm;
>  void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
>  void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
>  
> +int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len);
> +
>  #endif /* KVM__UTIL_H */
> diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c
> index 4e5bb82..d9e9993 100644
> --- a/tools/kvm/net/uip/core.c
> +++ b/tools/kvm/net/uip/core.c
> @@ -7,7 +7,7 @@
>  
>  int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
>  {
> -	struct virtio_net_hdr *vnet;
> +	struct virtio_net_hdr_mrg_rxbuf *vnet;
>  	struct uip_tx_arg arg;
>  	int eth_len, vnet_len;
>  	struct uip_eth *eth;
> @@ -74,63 +74,21 @@ int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
>  	return vnet_len + eth_len;
>  }
>  
> -int uip_rx(struct iovec *iov, u16 in, struct uip_info *info)
> +int uip_rx(unsigned char *buffer, u32 length, struct uip_info *info)
>  {
> -	struct virtio_net_hdr *vnet;
> -	struct uip_eth *eth;
>  	struct uip_buf *buf;
> -	int vnet_len;
> -	int eth_len;
> -	char *p;
>  	int len;
> -	int cnt;
> -	int i;
>  
>  	/*
>  	 * Sleep until there is a buffer for guest
>  	 */
>  	buf = uip_buf_get_used(info);
>  
> -	/*
> -	 * Fill device to guest buffer, vnet hdr fisrt
> -	 */
> -	vnet_len = iov[0].iov_len;
> -	vnet = iov[0].iov_base;
> -	if (buf->vnet_len > vnet_len) {
> -		len = -1;
> -		goto out;
> -	}
> -	memcpy(vnet, buf->vnet, buf->vnet_len);
> -
> -	/*
> -	 * Then, the real eth data
> -	 * Note: Be sure buf->eth_len is not bigger than the buffer len that guest provides
> -	 */
> -	cnt = buf->eth_len;
> -	p = buf->eth;
> -	for (i = 1; i < in; i++) {
> -		eth_len = iov[i].iov_len;
> -		eth = iov[i].iov_base;
> -		if (cnt > eth_len) {
> -			memcpy(eth, p, eth_len);
> -			cnt -= eth_len;
> -			p += eth_len;
> -		} else {
> -			memcpy(eth, p, cnt);
> -			cnt -= cnt;
> -			break;
> -		}
> -	}
> -
> -	if (cnt) {
> -		pr_warning("uip_rx error");
> -		len = -1;
> -		goto out;
> -	}
> +	memcpy(buffer, buf->vnet, buf->vnet_len);
> +	memcpy(buffer + buf->vnet_len, buf->eth, buf->eth_len);
>  
>  	len = buf->vnet_len + buf->eth_len;
>  
> -out:
>  	uip_buf_set_free(info, buf);
>  	return len;
>  }
> @@ -172,8 +130,8 @@ int uip_init(struct uip_info *info)
>  	}
>  
>  	list_for_each_entry(buf, buf_head, list) {
> -		buf->vnet	= malloc(sizeof(struct virtio_net_hdr));
> -		buf->vnet_len	= sizeof(struct virtio_net_hdr);
> +		buf->vnet	= malloc(sizeof(struct virtio_net_hdr_mrg_rxbuf));
> +		buf->vnet_len	= sizeof(struct virtio_net_hdr_mrg_rxbuf);
>  		buf->eth	= malloc(1024*64 + sizeof(struct uip_pseudo_hdr));
>  		buf->eth_len	= 1024*64 + sizeof(struct uip_pseudo_hdr);
>  
> diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c
> index 9044f40..a99b95e 100644
> --- a/tools/kvm/net/uip/tcp.c
> +++ b/tools/kvm/net/uip/tcp.c
> @@ -153,7 +153,7 @@ static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_
>  	/*
>  	 * virtio_net_hdr
>  	 */
> -	buf->vnet_len	= sizeof(struct virtio_net_hdr);
> +	buf->vnet_len	= sizeof(struct virtio_net_hdr_mrg_rxbuf);
>  	memset(buf->vnet, 0, buf->vnet_len);
>  
>  	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
> diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c
> index 31c417c..083c221 100644
> --- a/tools/kvm/net/uip/udp.c
> +++ b/tools/kvm/net/uip/udp.c
> @@ -142,7 +142,7 @@ int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct ui
>  	/*
>  	 * virtio_net_hdr
>  	 */
> -	buf->vnet_len	= sizeof(struct virtio_net_hdr);
> +	buf->vnet_len	= sizeof(struct virtio_net_hdr_mrg_rxbuf);
>  	memset(buf->vnet, 0, buf->vnet_len);
>  
>  	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
> diff --git a/tools/kvm/util/util.c b/tools/kvm/util/util.c
> index c11a15a..4e0069c 100644
> --- a/tools/kvm/util/util.c
> +++ b/tools/kvm/util/util.c
> @@ -9,6 +9,7 @@
>  #include <sys/mman.h>
>  #include <sys/stat.h>
>  #include <sys/statfs.h>
> +#include <linux/kernel.h>
>  
>  static void report(const char *prefix, const char *err, va_list params)
>  {
> @@ -131,3 +132,17 @@ void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 si
>  		return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
>  	}
>  }
> +
> +int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len)
> +{
> +	size_t copy, start_len = len;
> +
> +	for (; len > 0 && iovlen > 0; ++iov, --iovlen) {
> +		copy = min(iov->iov_len, len);
> +		memcpy(iov->iov_base, kdata, copy);
> +		kdata += copy;
> +		len -= copy;
> +	}
> +
> +	return start_len - len;
> +}
> diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
> index c0a8f12..c99c5a6 100644
> --- a/tools/kvm/virtio/net.c
> +++ b/tools/kvm/virtio/net.c
> @@ -32,8 +32,8 @@
>  struct net_dev;
>  
>  struct net_dev_operations {
> -	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
> -	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
> +	int (*rx)(unsigned char *buffer, u32 length, struct net_dev *ndev);
> +	int (*tx)(struct iovec *iov, u16 out, struct net_dev *ndev);
>  };
>  
>  struct net_dev {
> @@ -63,6 +63,8 @@ struct net_dev {
>  static LIST_HEAD(ndevs);
>  static int compat_id = -1;
>  
> +#define MAX_PACKET_SIZE ((u16)(-1))

Why 64KB - 1? The package can be 65550 bytes which is max TCP or UDP
plus 14 bytes eth header.

> +
>  static void *virtio_net_rx_thread(void *p)
>  {
>  	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
> @@ -90,9 +92,23 @@ static void *virtio_net_rx_thread(void *p)
>  		mutex_unlock(&ndev->io_lock[id]);
>  
>  		while (virt_queue__available(vq)) {
> +			unsigned char buffer[MAX_PACKET_SIZE];

The buffer size should at least be:

MAX_PACKET_SIZE + sizeof(struct virtio_net_hdr_mrg_rxbuf)

> +			struct virtio_net_hdr_mrg_rxbuf *hdr;
> +
> +			len = ndev->ops->rx(buffer, MAX_PACKET_SIZE, ndev);
>  			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
> -			len = ndev->ops->rx(iov, in, ndev);
> -			virt_queue__set_used_elem(vq, head, len);
> +			hdr = (void *)iov[0].iov_base;
> +			while (len) {
> +				int copied;
> +
> +				copied = memcpy_toiovecend(iov, in, buffer, len);
> +				len -= copied;
> +				hdr->num_buffers++;
> +				virt_queue__set_used_elem(vq, head, copied);
> +				if (len == 0)
> +					break;
> +				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);

Here, need to check if we still have available buffer from guest. If not
need to wait for it.

> +			}
>  
>  			/* We should interrupt guest right now, otherwise latency is huge. */
>  			if (virtio_queue__should_signal(vq))
> @@ -241,7 +257,7 @@ static bool virtio_net__tap_init(const struct virtio_net_params *params,
>  		goto fail;
>  	}
>  
> -	hdr_len = sizeof(struct virtio_net_hdr);
> +	hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>  	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
>  		pr_warning("Config tap device TUNSETVNETHDRSZ error");
>  
> @@ -300,9 +316,9 @@ static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
>  	return writev(ndev->tap_fd, iov, out);
>  }
>  
> -static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
> +static inline int tap_ops_rx(unsigned char *buffer, u32 length, struct net_dev *ndev)
>  {
> -	return readv(ndev->tap_fd, iov, in);
> +	return read(ndev->tap_fd, buffer, length);
>  }
>  
>  static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
> @@ -310,9 +326,9 @@ static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
>  	return uip_tx(iov, out, &ndev->info);
>  }
>  
> -static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
> +static inline int uip_ops_rx(unsigned char *buffer, u32 length, struct net_dev *ndev)
>  {
> -	return uip_rx(iov, in, &ndev->info);
> +	return uip_rx(buffer, length, &ndev->info);
>  }
>  
>  static struct net_dev_operations tap_ops = {
> @@ -347,7 +363,8 @@ static u32 get_host_features(struct kvm *kvm, void *dev)
>  		| 1UL << VIRTIO_RING_F_EVENT_IDX
>  		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
>  		| 1UL << VIRTIO_NET_F_CTRL_VQ
> -		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
> +		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0)
> +		| 1UL << VIRTIO_NET_F_MRG_RXBUF;
>  }
>  
>  static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
> -- 
> 1.8.2.1
>
Pekka Enberg April 24, 2013, 6:51 a.m. UTC | #7
Hi,

On 04/23/2013 12:35 PM, Eric Northup wrote:
>> Do you care about guests with drivers that don't negotiate
>> VIRTIO_NET_F_MRG_RXBUF?

On Wed, Apr 24, 2013 at 5:51 AM, Sasha Levin <sasha.levin@oracle.com> wrote:
> We usually try to keep backward compatibility, but in this case
> mergable RX buffers are about 5 years old now, so it's safe to
> assume they'll be running in any guest.
>
> Unless there is a specific reason to allow working without them
> I'd rather keep the code simple in this case.

Are there such guests around? What's the failure scenario for them
after this patch?

                        Pekka
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin April 24, 2013, 9:23 a.m. UTC | #8
On Wed, Apr 24, 2013 at 09:51:57AM +0300, Pekka Enberg wrote:
> Hi,
> 
> On 04/23/2013 12:35 PM, Eric Northup wrote:
> >> Do you care about guests with drivers that don't negotiate
> >> VIRTIO_NET_F_MRG_RXBUF?
> 
> On Wed, Apr 24, 2013 at 5:51 AM, Sasha Levin <sasha.levin@oracle.com> wrote:
> > We usually try to keep backward compatibility, but in this case
> > mergable RX buffers are about 5 years old now, so it's safe to
> > assume they'll be running in any guest.
> >
> > Unless there is a specific reason to allow working without them
> > I'd rather keep the code simple in this case.
> 
> Are there such guests around? What's the failure scenario for them
> after this patch?
> 
>                         Pekka

Warning: have not looked at the patch, just a general comment.

I think it's reasonable to assume embedded guests such as PXE won't
negotiate any features.  And, running old guests is one of the reasons
people use virtualization at all. So 5 years is not a lot.

In any case, stick to the device spec please, if you want it changed
please send a spec patch, don't deviate from it randomly.
Michael S. Tsirkin April 24, 2013, 9:35 a.m. UTC | #9
On Wed, Apr 24, 2013 at 01:32:56PM +0800, Asias He wrote:
> On Mon, Apr 22, 2013 at 08:32:27PM -0400, Sasha Levin wrote:
> > Support mergable rx buffers for virtio-net. This helps reduce the amount
> > of memory the guest kernel has to allocate per rx vq.
> 
> With this, we always do an extra copy of the rx data into a linear buffer.
> I am thinking about whether we should keep the non MRG_RXBUF code. For
> tap use case, it is fine, user can use vhost. But for uip use case, we
> can not use vhost.

You have to keep it but it might be ok not to optimize it.

> > Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
> > ---
> >  tools/kvm/include/kvm/uip.h  |  4 ++--
> >  tools/kvm/include/kvm/util.h |  3 +++
> >  tools/kvm/net/uip/core.c     | 54 +++++---------------------------------------
> >  tools/kvm/net/uip/tcp.c      |  2 +-
> >  tools/kvm/net/uip/udp.c      |  2 +-
> >  tools/kvm/util/util.c        | 15 ++++++++++++
> >  tools/kvm/virtio/net.c       | 37 ++++++++++++++++++++++--------
> >  7 files changed, 55 insertions(+), 62 deletions(-)
> > 
> > diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h
> > index ac248d2..fa82f10 100644
> > --- a/tools/kvm/include/kvm/uip.h
> > +++ b/tools/kvm/include/kvm/uip.h
> > @@ -252,7 +252,7 @@ struct uip_tcp_socket {
> >  };
> >  
> >  struct uip_tx_arg {
> > -	struct virtio_net_hdr *vnet;
> > +	struct virtio_net_hdr_mrg_rxbuf *vnet;
> >  	struct uip_info *info;
> >  	struct uip_eth *eth;
> >  	int vnet_len;
> > @@ -332,7 +332,7 @@ static inline u16 uip_eth_hdrlen(struct uip_eth *eth)
> >  }
> >  
> >  int uip_tx(struct iovec *iov, u16 out, struct uip_info *info);
> > -int uip_rx(struct iovec *iov, u16 in, struct uip_info *info);
> > +int uip_rx(unsigned char *buffer, u32 length, struct uip_info *info);
> >  int uip_init(struct uip_info *info);
> >  
> >  int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg);
> > diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h
> > index 0df9f0d..6f8ac83 100644
> > --- a/tools/kvm/include/kvm/util.h
> > +++ b/tools/kvm/include/kvm/util.h
> > @@ -22,6 +22,7 @@
> >  #include <sys/param.h>
> >  #include <sys/types.h>
> >  #include <linux/types.h>
> > +#include <sys/uio.h>
> >  
> >  #ifdef __GNUC__
> >  #define NORETURN __attribute__((__noreturn__))
> > @@ -94,4 +95,6 @@ struct kvm;
> >  void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
> >  void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
> >  
> > +int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len);
> > +
> >  #endif /* KVM__UTIL_H */
> > diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c
> > index 4e5bb82..d9e9993 100644
> > --- a/tools/kvm/net/uip/core.c
> > +++ b/tools/kvm/net/uip/core.c
> > @@ -7,7 +7,7 @@
> >  
> >  int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
> >  {
> > -	struct virtio_net_hdr *vnet;
> > +	struct virtio_net_hdr_mrg_rxbuf *vnet;
> >  	struct uip_tx_arg arg;
> >  	int eth_len, vnet_len;
> >  	struct uip_eth *eth;
> > @@ -74,63 +74,21 @@ int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
> >  	return vnet_len + eth_len;
> >  }
> >  
> > -int uip_rx(struct iovec *iov, u16 in, struct uip_info *info)
> > +int uip_rx(unsigned char *buffer, u32 length, struct uip_info *info)
> >  {
> > -	struct virtio_net_hdr *vnet;
> > -	struct uip_eth *eth;
> >  	struct uip_buf *buf;
> > -	int vnet_len;
> > -	int eth_len;
> > -	char *p;
> >  	int len;
> > -	int cnt;
> > -	int i;
> >  
> >  	/*
> >  	 * Sleep until there is a buffer for guest
> >  	 */
> >  	buf = uip_buf_get_used(info);
> >  
> > -	/*
> > -	 * Fill device to guest buffer, vnet hdr fisrt
> > -	 */
> > -	vnet_len = iov[0].iov_len;
> > -	vnet = iov[0].iov_base;
> > -	if (buf->vnet_len > vnet_len) {
> > -		len = -1;
> > -		goto out;
> > -	}
> > -	memcpy(vnet, buf->vnet, buf->vnet_len);
> > -
> > -	/*
> > -	 * Then, the real eth data
> > -	 * Note: Be sure buf->eth_len is not bigger than the buffer len that guest provides
> > -	 */
> > -	cnt = buf->eth_len;
> > -	p = buf->eth;
> > -	for (i = 1; i < in; i++) {
> > -		eth_len = iov[i].iov_len;
> > -		eth = iov[i].iov_base;
> > -		if (cnt > eth_len) {
> > -			memcpy(eth, p, eth_len);
> > -			cnt -= eth_len;
> > -			p += eth_len;
> > -		} else {
> > -			memcpy(eth, p, cnt);
> > -			cnt -= cnt;
> > -			break;
> > -		}
> > -	}
> > -
> > -	if (cnt) {
> > -		pr_warning("uip_rx error");
> > -		len = -1;
> > -		goto out;
> > -	}
> > +	memcpy(buffer, buf->vnet, buf->vnet_len);
> > +	memcpy(buffer + buf->vnet_len, buf->eth, buf->eth_len);
> >  
> >  	len = buf->vnet_len + buf->eth_len;
> >  
> > -out:
> >  	uip_buf_set_free(info, buf);
> >  	return len;
> >  }
> > @@ -172,8 +130,8 @@ int uip_init(struct uip_info *info)
> >  	}
> >  
> >  	list_for_each_entry(buf, buf_head, list) {
> > -		buf->vnet	= malloc(sizeof(struct virtio_net_hdr));
> > -		buf->vnet_len	= sizeof(struct virtio_net_hdr);
> > +		buf->vnet	= malloc(sizeof(struct virtio_net_hdr_mrg_rxbuf));
> > +		buf->vnet_len	= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> >  		buf->eth	= malloc(1024*64 + sizeof(struct uip_pseudo_hdr));
> >  		buf->eth_len	= 1024*64 + sizeof(struct uip_pseudo_hdr);
> >  
> > diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c
> > index 9044f40..a99b95e 100644
> > --- a/tools/kvm/net/uip/tcp.c
> > +++ b/tools/kvm/net/uip/tcp.c
> > @@ -153,7 +153,7 @@ static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_
> >  	/*
> >  	 * virtio_net_hdr
> >  	 */
> > -	buf->vnet_len	= sizeof(struct virtio_net_hdr);
> > +	buf->vnet_len	= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> >  	memset(buf->vnet, 0, buf->vnet_len);
> >  
> >  	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
> > diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c
> > index 31c417c..083c221 100644
> > --- a/tools/kvm/net/uip/udp.c
> > +++ b/tools/kvm/net/uip/udp.c
> > @@ -142,7 +142,7 @@ int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct ui
> >  	/*
> >  	 * virtio_net_hdr
> >  	 */
> > -	buf->vnet_len	= sizeof(struct virtio_net_hdr);
> > +	buf->vnet_len	= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> >  	memset(buf->vnet, 0, buf->vnet_len);
> >  
> >  	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
> > diff --git a/tools/kvm/util/util.c b/tools/kvm/util/util.c
> > index c11a15a..4e0069c 100644
> > --- a/tools/kvm/util/util.c
> > +++ b/tools/kvm/util/util.c
> > @@ -9,6 +9,7 @@
> >  #include <sys/mman.h>
> >  #include <sys/stat.h>
> >  #include <sys/statfs.h>
> > +#include <linux/kernel.h>
> >  
> >  static void report(const char *prefix, const char *err, va_list params)
> >  {
> > @@ -131,3 +132,17 @@ void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 si
> >  		return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
> >  	}
> >  }
> > +
> > +int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len)
> > +{
> > +	size_t copy, start_len = len;
> > +
> > +	for (; len > 0 && iovlen > 0; ++iov, --iovlen) {
> > +		copy = min(iov->iov_len, len);
> > +		memcpy(iov->iov_base, kdata, copy);
> > +		kdata += copy;
> > +		len -= copy;
> > +	}
> > +
> > +	return start_len - len;
> > +}
> > diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
> > index c0a8f12..c99c5a6 100644
> > --- a/tools/kvm/virtio/net.c
> > +++ b/tools/kvm/virtio/net.c
> > @@ -32,8 +32,8 @@
> >  struct net_dev;
> >  
> >  struct net_dev_operations {
> > -	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
> > -	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
> > +	int (*rx)(unsigned char *buffer, u32 length, struct net_dev *ndev);
> > +	int (*tx)(struct iovec *iov, u16 out, struct net_dev *ndev);
> >  };
> >  
> >  struct net_dev {
> > @@ -63,6 +63,8 @@ struct net_dev {
> >  static LIST_HEAD(ndevs);
> >  static int compat_id = -1;
> >  
> > +#define MAX_PACKET_SIZE ((u16)(-1))
> 
> Why 64KB - 1? The package can be 65550 bytes which is max TCP or UDP
> plus 14 bytes eth header.
> 
> > +
> >  static void *virtio_net_rx_thread(void *p)
> >  {
> >  	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
> > @@ -90,9 +92,23 @@ static void *virtio_net_rx_thread(void *p)
> >  		mutex_unlock(&ndev->io_lock[id]);
> >  
> >  		while (virt_queue__available(vq)) {
> > +			unsigned char buffer[MAX_PACKET_SIZE];
> 
> The buffer size should at least be:
> 
> MAX_PACKET_SIZE + sizeof(struct virtio_net_hdr_mrg_rxbuf)
> 
> > +			struct virtio_net_hdr_mrg_rxbuf *hdr;
> > +
> > +			len = ndev->ops->rx(buffer, MAX_PACKET_SIZE, ndev);
> >  			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
> > -			len = ndev->ops->rx(iov, in, ndev);
> > -			virt_queue__set_used_elem(vq, head, len);
> > +			hdr = (void *)iov[0].iov_base;
> > +			while (len) {
> > +				int copied;
> > +
> > +				copied = memcpy_toiovecend(iov, in, buffer, len);
> > +				len -= copied;
> > +				hdr->num_buffers++;
> > +				virt_queue__set_used_elem(vq, head, copied);
> > +				if (len == 0)
> > +					break;
> > +				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
> 
> Here, need to check if we still have available buffer from guest. If not
> need to wait for it.
> 
> > +			}
> >  
> >  			/* We should interrupt guest right now, otherwise latency is huge. */
> >  			if (virtio_queue__should_signal(vq))
> > @@ -241,7 +257,7 @@ static bool virtio_net__tap_init(const struct virtio_net_params *params,
> >  		goto fail;
> >  	}
> >  
> > -	hdr_len = sizeof(struct virtio_net_hdr);
> > +	hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> >  	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
> >  		pr_warning("Config tap device TUNSETVNETHDRSZ error");
> >  
> > @@ -300,9 +316,9 @@ static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
> >  	return writev(ndev->tap_fd, iov, out);
> >  }
> >  
> > -static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
> > +static inline int tap_ops_rx(unsigned char *buffer, u32 length, struct net_dev *ndev)
> >  {
> > -	return readv(ndev->tap_fd, iov, in);
> > +	return read(ndev->tap_fd, buffer, length);
> >  }
> >  
> >  static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
> > @@ -310,9 +326,9 @@ static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
> >  	return uip_tx(iov, out, &ndev->info);
> >  }
> >  
> > -static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
> > +static inline int uip_ops_rx(unsigned char *buffer, u32 length, struct net_dev *ndev)
> >  {
> > -	return uip_rx(iov, in, &ndev->info);
> > +	return uip_rx(buffer, length, &ndev->info);
> >  }
> >  
> >  static struct net_dev_operations tap_ops = {
> > @@ -347,7 +363,8 @@ static u32 get_host_features(struct kvm *kvm, void *dev)
> >  		| 1UL << VIRTIO_RING_F_EVENT_IDX
> >  		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
> >  		| 1UL << VIRTIO_NET_F_CTRL_VQ
> > -		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
> > +		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0)
> > +		| 1UL << VIRTIO_NET_F_MRG_RXBUF;
> >  }
> >  
> >  static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
> > -- 
> > 1.8.2.1
> > 
> 
> -- 
> Asias
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rusty Russell April 29, 2013, 12:44 a.m. UTC | #10
"Michael S. Tsirkin" <mst@redhat.com> writes:
> On Wed, Apr 24, 2013 at 09:51:57AM +0300, Pekka Enberg wrote:
>> Hi,
>> 
>> On 04/23/2013 12:35 PM, Eric Northup wrote:
>> >> Do you care about guests with drivers that don't negotiate
>> >> VIRTIO_NET_F_MRG_RXBUF?
>> 
>> On Wed, Apr 24, 2013 at 5:51 AM, Sasha Levin <sasha.levin@oracle.com> wrote:
>> > We usually try to keep backward compatibility, but in this case
>> > mergable RX buffers are about 5 years old now, so it's safe to
>> > assume they'll be running in any guest.
>> >
>> > Unless there is a specific reason to allow working without them
>> > I'd rather keep the code simple in this case.
>> 
>> Are there such guests around? What's the failure scenario for them
>> after this patch?
>> 
>>                         Pekka
>
> Warning: have not looked at the patch, just a general comment.
>
> I think it's reasonable to assume embedded guests such as PXE won't
> negotiate any features.  And, running old guests is one of the reasons
> people use virtualization at all. So 5 years is not a lot.
>
> In any case, stick to the device spec please, if you want it changed
> please send a spec patch, don't deviate from it randomly.

Supporting old guests is an quality of implementation issue.  It's like
any ABI: if noone will notice, you can remove stuff.

But the case of "I can receive GSO packets but I don't support mergeable
buffers" is a trivial one: you can "support" it by pretending the guest
can't handle GSO :)

If you want to support non-Linux guests (eg. bootloaders), you probably
want to keep support for very dumb drivers with no mergable rxbufs
though.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sasha Levin April 29, 2013, 2:36 a.m. UTC | #11
On 04/28/2013 08:44 PM, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> > On Wed, Apr 24, 2013 at 09:51:57AM +0300, Pekka Enberg wrote:
>>> >> Hi,
>>> >> 
>>> >> On 04/23/2013 12:35 PM, Eric Northup wrote:
>>>>> >> >> Do you care about guests with drivers that don't negotiate
>>>>> >> >> VIRTIO_NET_F_MRG_RXBUF?
>>> >> 
>>> >> On Wed, Apr 24, 2013 at 5:51 AM, Sasha Levin <sasha.levin@oracle.com> wrote:
>>>> >> > We usually try to keep backward compatibility, but in this case
>>>> >> > mergable RX buffers are about 5 years old now, so it's safe to
>>>> >> > assume they'll be running in any guest.
>>>> >> >
>>>> >> > Unless there is a specific reason to allow working without them
>>>> >> > I'd rather keep the code simple in this case.
>>> >> 
>>> >> Are there such guests around? What's the failure scenario for them
>>> >> after this patch?
>>> >> 
>>> >>                         Pekka
>> >
>> > Warning: have not looked at the patch, just a general comment.
>> >
>> > I think it's reasonable to assume embedded guests such as PXE won't
>> > negotiate any features.  And, running old guests is one of the reasons
>> > people use virtualization at all. So 5 years is not a lot.
>> >
>> > In any case, stick to the device spec please, if you want it changed
>> > please send a spec patch, don't deviate from it randomly.
> Supporting old guests is an quality of implementation issue.  It's like
> any ABI: if noone will notice, you can remove stuff.
> 
> But the case of "I can receive GSO packets but I don't support mergeable
> buffers" is a trivial one: you can "support" it by pretending the guest
> can't handle GSO :)
> 
> If you want to support non-Linux guests (eg. bootloaders), you probably
> want to keep support for very dumb drivers with no mergable rxbufs
> though.

Yup, I'm planning on sending a version that supports older guests soonish.


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h
index ac248d2..fa82f10 100644
--- a/tools/kvm/include/kvm/uip.h
+++ b/tools/kvm/include/kvm/uip.h
@@ -252,7 +252,7 @@  struct uip_tcp_socket {
 };
 
 struct uip_tx_arg {
-	struct virtio_net_hdr *vnet;
+	struct virtio_net_hdr_mrg_rxbuf *vnet;
 	struct uip_info *info;
 	struct uip_eth *eth;
 	int vnet_len;
@@ -332,7 +332,7 @@  static inline u16 uip_eth_hdrlen(struct uip_eth *eth)
 }
 
 int uip_tx(struct iovec *iov, u16 out, struct uip_info *info);
-int uip_rx(struct iovec *iov, u16 in, struct uip_info *info);
+int uip_rx(unsigned char *buffer, u32 length, struct uip_info *info);
 int uip_init(struct uip_info *info);
 
 int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg);
diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h
index 0df9f0d..6f8ac83 100644
--- a/tools/kvm/include/kvm/util.h
+++ b/tools/kvm/include/kvm/util.h
@@ -22,6 +22,7 @@ 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <linux/types.h>
+#include <sys/uio.h>
 
 #ifdef __GNUC__
 #define NORETURN __attribute__((__noreturn__))
@@ -94,4 +95,6 @@  struct kvm;
 void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
 void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
 
+int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len);
+
 #endif /* KVM__UTIL_H */
diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c
index 4e5bb82..d9e9993 100644
--- a/tools/kvm/net/uip/core.c
+++ b/tools/kvm/net/uip/core.c
@@ -7,7 +7,7 @@ 
 
 int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
 {
-	struct virtio_net_hdr *vnet;
+	struct virtio_net_hdr_mrg_rxbuf *vnet;
 	struct uip_tx_arg arg;
 	int eth_len, vnet_len;
 	struct uip_eth *eth;
@@ -74,63 +74,21 @@  int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
 	return vnet_len + eth_len;
 }
 
-int uip_rx(struct iovec *iov, u16 in, struct uip_info *info)
+int uip_rx(unsigned char *buffer, u32 length, struct uip_info *info)
 {
-	struct virtio_net_hdr *vnet;
-	struct uip_eth *eth;
 	struct uip_buf *buf;
-	int vnet_len;
-	int eth_len;
-	char *p;
 	int len;
-	int cnt;
-	int i;
 
 	/*
 	 * Sleep until there is a buffer for guest
 	 */
 	buf = uip_buf_get_used(info);
 
-	/*
-	 * Fill device to guest buffer, vnet hdr fisrt
-	 */
-	vnet_len = iov[0].iov_len;
-	vnet = iov[0].iov_base;
-	if (buf->vnet_len > vnet_len) {
-		len = -1;
-		goto out;
-	}
-	memcpy(vnet, buf->vnet, buf->vnet_len);
-
-	/*
-	 * Then, the real eth data
-	 * Note: Be sure buf->eth_len is not bigger than the buffer len that guest provides
-	 */
-	cnt = buf->eth_len;
-	p = buf->eth;
-	for (i = 1; i < in; i++) {
-		eth_len = iov[i].iov_len;
-		eth = iov[i].iov_base;
-		if (cnt > eth_len) {
-			memcpy(eth, p, eth_len);
-			cnt -= eth_len;
-			p += eth_len;
-		} else {
-			memcpy(eth, p, cnt);
-			cnt -= cnt;
-			break;
-		}
-	}
-
-	if (cnt) {
-		pr_warning("uip_rx error");
-		len = -1;
-		goto out;
-	}
+	memcpy(buffer, buf->vnet, buf->vnet_len);
+	memcpy(buffer + buf->vnet_len, buf->eth, buf->eth_len);
 
 	len = buf->vnet_len + buf->eth_len;
 
-out:
 	uip_buf_set_free(info, buf);
 	return len;
 }
@@ -172,8 +130,8 @@  int uip_init(struct uip_info *info)
 	}
 
 	list_for_each_entry(buf, buf_head, list) {
-		buf->vnet	= malloc(sizeof(struct virtio_net_hdr));
-		buf->vnet_len	= sizeof(struct virtio_net_hdr);
+		buf->vnet	= malloc(sizeof(struct virtio_net_hdr_mrg_rxbuf));
+		buf->vnet_len	= sizeof(struct virtio_net_hdr_mrg_rxbuf);
 		buf->eth	= malloc(1024*64 + sizeof(struct uip_pseudo_hdr));
 		buf->eth_len	= 1024*64 + sizeof(struct uip_pseudo_hdr);
 
diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c
index 9044f40..a99b95e 100644
--- a/tools/kvm/net/uip/tcp.c
+++ b/tools/kvm/net/uip/tcp.c
@@ -153,7 +153,7 @@  static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_
 	/*
 	 * virtio_net_hdr
 	 */
-	buf->vnet_len	= sizeof(struct virtio_net_hdr);
+	buf->vnet_len	= sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	memset(buf->vnet, 0, buf->vnet_len);
 
 	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c
index 31c417c..083c221 100644
--- a/tools/kvm/net/uip/udp.c
+++ b/tools/kvm/net/uip/udp.c
@@ -142,7 +142,7 @@  int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct ui
 	/*
 	 * virtio_net_hdr
 	 */
-	buf->vnet_len	= sizeof(struct virtio_net_hdr);
+	buf->vnet_len	= sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	memset(buf->vnet, 0, buf->vnet_len);
 
 	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
diff --git a/tools/kvm/util/util.c b/tools/kvm/util/util.c
index c11a15a..4e0069c 100644
--- a/tools/kvm/util/util.c
+++ b/tools/kvm/util/util.c
@@ -9,6 +9,7 @@ 
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/statfs.h>
+#include <linux/kernel.h>
 
 static void report(const char *prefix, const char *err, va_list params)
 {
@@ -131,3 +132,17 @@  void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 si
 		return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
 	}
 }
+
+int memcpy_toiovecend(const struct iovec *iov, int iovlen, unsigned char *kdata, size_t len)
+{
+	size_t copy, start_len = len;
+
+	for (; len > 0 && iovlen > 0; ++iov, --iovlen) {
+		copy = min(iov->iov_len, len);
+		memcpy(iov->iov_base, kdata, copy);
+		kdata += copy;
+		len -= copy;
+	}
+
+	return start_len - len;
+}
diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
index c0a8f12..c99c5a6 100644
--- a/tools/kvm/virtio/net.c
+++ b/tools/kvm/virtio/net.c
@@ -32,8 +32,8 @@ 
 struct net_dev;
 
 struct net_dev_operations {
-	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
-	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+	int (*rx)(unsigned char *buffer, u32 length, struct net_dev *ndev);
+	int (*tx)(struct iovec *iov, u16 out, struct net_dev *ndev);
 };
 
 struct net_dev {
@@ -63,6 +63,8 @@  struct net_dev {
 static LIST_HEAD(ndevs);
 static int compat_id = -1;
 
+#define MAX_PACKET_SIZE ((u16)(-1))
+
 static void *virtio_net_rx_thread(void *p)
 {
 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
@@ -90,9 +92,23 @@  static void *virtio_net_rx_thread(void *p)
 		mutex_unlock(&ndev->io_lock[id]);
 
 		while (virt_queue__available(vq)) {
+			unsigned char buffer[MAX_PACKET_SIZE];
+			struct virtio_net_hdr_mrg_rxbuf *hdr;
+
+			len = ndev->ops->rx(buffer, MAX_PACKET_SIZE, ndev);
 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
-			len = ndev->ops->rx(iov, in, ndev);
-			virt_queue__set_used_elem(vq, head, len);
+			hdr = (void *)iov[0].iov_base;
+			while (len) {
+				int copied;
+
+				copied = memcpy_toiovecend(iov, in, buffer, len);
+				len -= copied;
+				hdr->num_buffers++;
+				virt_queue__set_used_elem(vq, head, copied);
+				if (len == 0)
+					break;
+				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			}
 
 			/* We should interrupt guest right now, otherwise latency is huge. */
 			if (virtio_queue__should_signal(vq))
@@ -241,7 +257,7 @@  static bool virtio_net__tap_init(const struct virtio_net_params *params,
 		goto fail;
 	}
 
-	hdr_len = sizeof(struct virtio_net_hdr);
+	hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
 		pr_warning("Config tap device TUNSETVNETHDRSZ error");
 
@@ -300,9 +316,9 @@  static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
 	return writev(ndev->tap_fd, iov, out);
 }
 
-static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+static inline int tap_ops_rx(unsigned char *buffer, u32 length, struct net_dev *ndev)
 {
-	return readv(ndev->tap_fd, iov, in);
+	return read(ndev->tap_fd, buffer, length);
 }
 
 static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
@@ -310,9 +326,9 @@  static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
 	return uip_tx(iov, out, &ndev->info);
 }
 
-static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+static inline int uip_ops_rx(unsigned char *buffer, u32 length, struct net_dev *ndev)
 {
-	return uip_rx(iov, in, &ndev->info);
+	return uip_rx(buffer, length, &ndev->info);
 }
 
 static struct net_dev_operations tap_ops = {
@@ -347,7 +363,8 @@  static u32 get_host_features(struct kvm *kvm, void *dev)
 		| 1UL << VIRTIO_RING_F_EVENT_IDX
 		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
 		| 1UL << VIRTIO_NET_F_CTRL_VQ
-		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
+		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0)
+		| 1UL << VIRTIO_NET_F_MRG_RXBUF;
 }
 
 static void set_guest_features(struct kvm *kvm, void *dev, u32 features)