diff mbox series

[v5,3/6] QIOChannelSocket: Implement io_writev_zerocopy & io_flush_zerocopy for CONFIG_LINUX

Message ID 20211112051040.923746-4-leobras@redhat.com (mailing list archive)
State New, archived
Headers show
Series MSG_ZEROCOPY + multifd | expand

Commit Message

Leonardo Bras Nov. 12, 2021, 5:10 a.m. UTC
For CONFIG_LINUX, implement the new optional callbacks io_write_zerocopy and
io_flush_zerocopy on QIOChannelSocket, but enables it only when MSG_ZEROCOPY
feature is available in the host kernel, which is checked on
qio_channel_socket_connect_sync()

qio_channel_socket_flush_zerocopy() was implemented by counting how many times
sendmsg(...,MSG_ZEROCOPY) was successfully called, and then reading the
socket's error queue, in order to find how many of them finished sending.
Flush will loop until those counters are the same, or until some error occurs.

A new function qio_channel_socket_poll() was also created in order to avoid
busy-looping recvmsg() in qio_channel_socket_flush_zerocopy() while waiting for
updates in socket's error queue.

Notes on using writev_zerocopy():
1: Buffer
- As MSG_ZEROCOPY tells the kernel to use the same user buffer to avoid copying,
some caution is necessary to avoid overwriting any buffer before it's sent.
If something like this happens, a newer version of the buffer may be sent instead.
- If this is a problem, it's recommended to call flush_zerocopy() before freeing
or re-using the buffer.

2: Locked memory
- When using MSG_ZEROCOPY, the buffer memory will be locked after queued, and
unlocked after it's sent.
- Depending on the size of each buffer, and how often it's sent, it may require
a larger amount of locked memory than usually available to non-root user.
- If the required amount of locked memory is not available, writev_zerocopy
will return an error, which can abort an operation like migration.
- Because of this, when user code wants to add zerocopy as a feature, it
requires a mechanism to disable it, so it can still be accessible to less
privileged users.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
---
 include/io/channel-socket.h |   2 +
 include/io/channel.h        |   1 +
 io/channel-socket.c         | 150 +++++++++++++++++++++++++++++++++++-
 3 files changed, 150 insertions(+), 3 deletions(-)

Comments

Daniel P. Berrangé Nov. 12, 2021, 10:54 a.m. UTC | #1
On Fri, Nov 12, 2021 at 02:10:38AM -0300, Leonardo Bras wrote:
> For CONFIG_LINUX, implement the new optional callbacks io_write_zerocopy and
> io_flush_zerocopy on QIOChannelSocket, but enables it only when MSG_ZEROCOPY
> feature is available in the host kernel, which is checked on
> qio_channel_socket_connect_sync()
> 
> qio_channel_socket_flush_zerocopy() was implemented by counting how many times
> sendmsg(...,MSG_ZEROCOPY) was successfully called, and then reading the
> socket's error queue, in order to find how many of them finished sending.
> Flush will loop until those counters are the same, or until some error occurs.
> 
> A new function qio_channel_socket_poll() was also created in order to avoid
> busy-looping recvmsg() in qio_channel_socket_flush_zerocopy() while waiting for
> updates in socket's error queue.
> 
> Notes on using writev_zerocopy():
> 1: Buffer
> - As MSG_ZEROCOPY tells the kernel to use the same user buffer to avoid copying,
> some caution is necessary to avoid overwriting any buffer before it's sent.
> If something like this happen, a newer version of the buffer may be sent instead.
> - If this is a problem, it's recommended to call flush_zerocopy() before freeing
> or re-using the buffer.
> 
> 2: Locked memory
> - When using MSG_ZERCOCOPY, the buffer memory will be locked after queued, and
> unlocked after it's sent.
> - Depending on the size of each buffer, and how often it's sent, it may require
> a larger amount of locked memory than usually available to non-root user.
> - If the required amount of locked memory is not available, writev_zerocopy
> will return an error, which can abort an operation like migration,
> - Because of this, when an user code wants to add zerocopy as a feature, it
> requires a mechanism to disable it, so it can still be accessible to less
> privileged users.
> 
> Signed-off-by: Leonardo Bras <leobras@redhat.com>
> ---
>  include/io/channel-socket.h |   2 +
>  include/io/channel.h        |   1 +
>  io/channel-socket.c         | 150 +++++++++++++++++++++++++++++++++++-
>  3 files changed, 150 insertions(+), 3 deletions(-)
> 
> diff --git a/include/io/channel-socket.h b/include/io/channel-socket.h
> index e747e63514..81d04baa4c 100644
> --- a/include/io/channel-socket.h
> +++ b/include/io/channel-socket.h
> @@ -47,6 +47,8 @@ struct QIOChannelSocket {
>      socklen_t localAddrLen;
>      struct sockaddr_storage remoteAddr;
>      socklen_t remoteAddrLen;
> +    ssize_t zerocopy_queued;
> +    ssize_t zerocopy_sent;
>  };
>  
>  
> diff --git a/include/io/channel.h b/include/io/channel.h
> index a19c09bb84..051fff4197 100644
> --- a/include/io/channel.h
> +++ b/include/io/channel.h
> @@ -31,6 +31,7 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
>  
>  
>  #define QIO_CHANNEL_ERR_BLOCK -2
> +#define QIO_CHANNEL_ERR_NOBUFS -3
>  
>  #define QIO_CHANNEL_WRITE_FLAG_ZEROCOPY 0x1
>  
> diff --git a/io/channel-socket.c b/io/channel-socket.c
> index b57a27bf91..c724b849ad 100644
> --- a/io/channel-socket.c
> +++ b/io/channel-socket.c
> @@ -26,6 +26,10 @@
>  #include "io/channel-watch.h"
>  #include "trace.h"
>  #include "qapi/clone-visitor.h"
> +#ifdef CONFIG_LINUX
> +#include <linux/errqueue.h>
> +#include <poll.h>
> +#endif
>  
>  #define SOCKET_MAX_FDS 16
>  
> @@ -55,6 +59,8 @@ qio_channel_socket_new(void)
>  
>      sioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET));
>      sioc->fd = -1;
> +    sioc->zerocopy_queued = 0;
> +    sioc->zerocopy_sent = 0;
>  
>      ioc = QIO_CHANNEL(sioc);
>      qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN);
> @@ -140,6 +146,7 @@ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
>                                      Error **errp)
>  {
>      int fd;
> +    int ret, v = 1;
>  
>      trace_qio_channel_socket_connect_sync(ioc, addr);
>      fd = socket_connect(addr, errp);
> @@ -154,6 +161,15 @@ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
>          return -1;
>      }
>  
> +#ifdef CONFIG_LINUX
> +    ret = qemu_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &v, sizeof(v));
> +    if (ret == 0) {
> +        /* Zerocopy available on host */
> +        qio_channel_set_feature(QIO_CHANNEL(ioc),
> +                                QIO_CHANNEL_FEATURE_WRITE_ZEROCOPY);
> +    }
> +#endif
> +
>      return 0;
>  }
>  
> @@ -561,12 +577,15 @@ static ssize_t qio_channel_socket_writev_flags(QIOChannel *ioc,
>   retry:
>      ret = sendmsg(sioc->fd, &msg, flags);
>      if (ret <= 0) {
> -        if (errno == EAGAIN) {
> +        switch (errno) {
> +        case EAGAIN:
>              return QIO_CHANNEL_ERR_BLOCK;
> -        }
> -        if (errno == EINTR) {
> +        case EINTR:
>              goto retry;
> +        case ENOBUFS:
> +            return QIO_CHANNEL_ERR_NOBUFS;

Why does ENOBUFS need handling separately instead of letting
the error_setg_errno below handle it ?

The caller immediately invokes error_setg_errno() again,
just with different error message.

No code in this series ever looks at QIO_CHANNEL_ERR_NOBUFS
either, so we don't even need that special error return code
added AFAICT ?

>          }
> +
>          error_setg_errno(errp, errno,
>                           "Unable to write to socket");
>          return -1;
> @@ -670,6 +689,127 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
>  }
>  #endif /* WIN32 */
>  
> +
> +#ifdef CONFIG_LINUX
> +
> +static int qio_channel_socket_poll(QIOChannelSocket *sioc, bool zerocopy,
> +                                   Error **errp)

There's only one caller and it always passes zerocopy=true,
so this parameter looks pointless.

> +{
> +    struct pollfd pfd;
> +    int ret;
> +
> +    pfd.fd = sioc->fd;
> +    pfd.events = 0;
> +
> + retry:
> +    ret = poll(&pfd, 1, -1);
> +    if (ret < 0) {
> +        switch (errno) {
> +        case EAGAIN:
> +        case EINTR:
> +            goto retry;
> +        default:
> +            error_setg_errno(errp, errno,
> +                             "Poll error");
> +            return ret;

       return -1;

> +        }
> +    }
> +
> +    if (pfd.revents & (POLLHUP | POLLNVAL)) {
> +        error_setg(errp, "Poll error: Invalid or disconnected fd");
> +        return -1;
> +    }
> +
> +    if (!zerocopy && (pfd.revents & POLLERR)) {
> +        error_setg(errp, "Poll error: Errors present in errqueue");
> +        return -1;
> +    }

> +
> +    return ret;

  return 0;

> +}
> +
> +static ssize_t qio_channel_socket_writev_zerocopy(QIOChannel *ioc,
> +                                                  const struct iovec *iov,
> +                                                  size_t niov,
> +                                                  Error **errp)
> +{
> +    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
> +    ssize_t ret;
> +
> +    ret = qio_channel_socket_writev_flags(ioc, iov, niov, NULL, 0,
> +                                          MSG_ZEROCOPY, errp);
> +    if (ret == QIO_CHANNEL_ERR_NOBUFS) {
> +        error_setg_errno(errp, errno,
> +                         "Process can't lock enough memory for using MSG_ZEROCOPY");

This should not be touching errno - the method should be setting the
errp directly, not leaving it to the caller.

> +        return -1;
> +    }
> +
> +    sioc->zerocopy_queued++;

 if (ret > 0)
    sio->zerocopy_queued++

since the kernel doesn't increase the counter if the data sent
was zero length. A caller shouldn't be passing us a zero length
iov data element, but it is wise to be cautious

> +    return ret;
> +}
> +
> +static int qio_channel_socket_flush_zerocopy(QIOChannel *ioc,
> +                                             Error **errp)
> +{
> +    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
> +    struct msghdr msg = {};
> +    struct sock_extended_err *serr;
> +    struct cmsghdr *cm;
> +    char control[CMSG_SPACE(sizeof(*serr))];
> +    int ret;

Add

   int rv = 0;

> +
> +    msg.msg_control = control;
> +    msg.msg_controllen = sizeof(control);
> +    memset(control, 0, sizeof(control));
> +
> +    while (sioc->zerocopy_sent < sioc->zerocopy_queued) {
> +        ret = recvmsg(sioc->fd, &msg, MSG_ERRQUEUE);
> +        if (ret < 0) {
> +            switch (errno) {
> +            case EAGAIN:
> +                /* Nothing on errqueue, wait until something is available */
> +                ret = qio_channel_socket_poll(sioc, true, errp);
> +                if (ret < 0) {
> +                    return -1;
> +                }
> +                continue;
> +            case EINTR:
> +                continue;
> +            default:
> +                error_setg_errno(errp, errno,
> +                                 "Unable to read errqueue");
> +                return -1;
> +            }
> +        }
> +
> +        cm = CMSG_FIRSTHDR(&msg);
> +        if (cm->cmsg_level != SOL_IP &&
> +            cm->cmsg_type != IP_RECVERR) {
> +            error_setg_errno(errp, EPROTOTYPE,
> +                             "Wrong cmsg in errqueue");
> +            return -1;
> +        }
> +
> +        serr = (void *) CMSG_DATA(cm);
> +        if (serr->ee_errno != SO_EE_ORIGIN_NONE) {
> +            error_setg_errno(errp, serr->ee_errno,
> +                             "Error on socket");
> +            return -1;
> +        }
> +        if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
> +            error_setg_errno(errp, serr->ee_origin,
> +                             "Error not from zerocopy");
> +            return -1;
> +        }
> +
> +        /* No errors, count successfully finished sendmsg()*/
> +        sioc->zerocopy_sent += serr->ee_data - serr->ee_info + 1;

Here add


     if (ee_code ==  SO_EE_CODE_ZEROCOPY_COPIED)
        rv = 1;

> +    }
> +    return 0;

return rv;

> +}



Regards,
Daniel
Leonardo Bras Nov. 23, 2021, 4:46 a.m. UTC | #2
Hello Daniel,

On Fri, Nov 12, 2021 at 7:54 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
[...]
> > @@ -561,12 +577,15 @@ static ssize_t qio_channel_socket_writev_flags(QIOChannel *ioc,
> >   retry:
> >      ret = sendmsg(sioc->fd, &msg, flags);
> >      if (ret <= 0) {
> > -        if (errno == EAGAIN) {
> > +        switch (errno) {
> > +        case EAGAIN:
> >              return QIO_CHANNEL_ERR_BLOCK;
> > -        }
> > -        if (errno == EINTR) {
> > +        case EINTR:
> >              goto retry;
> > +        case ENOBUFS:
> > +            return QIO_CHANNEL_ERR_NOBUFS;
>
> Why does ENOBUFS need handling separately instead of letting
> the error_setg_errno below handle it ?
>
> The caller immediately invokes error_setg_errno() again,
> just with different error message.
>
> No code in this series ever looks at QIO_CHANNEL_ERR_NOBUFS
> either, so we don't even need that special error return code
> added AFAICT ?
>

The idea was to add a custom message for ENOBUFS return when sending
with MSG_ZEROCOPY.
I mean, having this message is important for the user to understand
why the migration is failing, but it would
not make any sense to have this message while a non-zerocopy sendmsg()
returns with ENOBUFS.

ENOBUFS : The output queue for a network interface was full.  This
generally indicates that the interface has stopped sending, but may be
caused by transient congestion.

As an alternative, I could add this message inside the switch, inside
an if (flags & MSG_ZEROCOPY) on qio_channel_socket_writev_flags()
instead of in it's caller.
But for me it looks bloated, I mean, dealing with an error for
ZEROCOPY only in the general function.

OTOH, if you think that it's a better idea to deal with every error in
qio_channel_socket_writev_flags() instead of in the caller, I will
change it for v6. Please let me know.

> >          }
> > +
> >          error_setg_errno(errp, errno,
> >                           "Unable to write to socket");
> >          return -1;
> > @@ -670,6 +689,127 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> >  }
> >  #endif /* WIN32 */
> >
> > +
> > +#ifdef CONFIG_LINUX
> > +
> > +static int qio_channel_socket_poll(QIOChannelSocket *sioc, bool zerocopy,
> > +                                   Error **errp)
>
> There's only one caller and it always passes zerocopy=true,
> so this parmeter looks pointless.

I did that for possible reuse of this function in the future:
- As of today, this is certainly compiled out, but if at some point
someone wants to use poll for something other
than the reading of a zerocopy errqueue, it could be reused.

But sure, if that's not desirable, I can remove the parameter (and the
if clause for !zerocopy).

>
> > +{
> > +    struct pollfd pfd;
> > +    int ret;
> > +
> > +    pfd.fd = sioc->fd;
> > +    pfd.events = 0;
> > +
> > + retry:
> > +    ret = poll(&pfd, 1, -1);
> > +    if (ret < 0) {
> > +        switch (errno) {
> > +        case EAGAIN:
> > +        case EINTR:
> > +            goto retry;
> > +        default:
> > +            error_setg_errno(errp, errno,
> > +                             "Poll error");
> > +            return ret;
>
>        return -1;
>
> > +        }
> > +    }
> > +
> > +    if (pfd.revents & (POLLHUP | POLLNVAL)) {
> > +        error_setg(errp, "Poll error: Invalid or disconnected fd");
> > +        return -1;
> > +    }
> > +
> > +    if (!zerocopy && (pfd.revents & POLLERR)) {
> > +        error_setg(errp, "Poll error: Errors present in errqueue");
> > +        return -1;
> > +    }
>
> > +
> > +    return ret;
>
>   return 0;

In the idea of future reuse I spoke above, returning zero here would
make this function always look like the poll timed out. Some future
users may want to repeat the waiting if poll() timed out, or if
(return > 0) stop polling.
(That was an earlier approach of this series)

>
> > +}
> > +
> > +static ssize_t qio_channel_socket_writev_zerocopy(QIOChannel *ioc,
> > +                                                  const struct iovec *iov,
> > +                                                  size_t niov,
> > +                                                  Error **errp)
> > +{
> > +    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
> > +    ssize_t ret;
> > +
> > +    ret = qio_channel_socket_writev_flags(ioc, iov, niov, NULL, 0,
> > +                                          MSG_ZEROCOPY, errp);
> > +    if (ret == QIO_CHANNEL_ERR_NOBUFS) {
> > +        error_setg_errno(errp, errno,
> > +                         "Process can't lock enough memory for using MSG_ZEROCOPY");
>
> This should not be touching errno - the method should be setting the
> errp directly, not leaving it to the caller.
>

Discussed above.
If you think that's a better approach I can change for v6.


> > +        return -1;
> > +    }
> > +
> > +    sioc->zerocopy_queued++;
>
>  if (ret > 0)
>     sio->zerocopy_queued++
>

Nice catch!

> since the kernel doesn't increase the counter if the data sent
> was zero length. A caller shouldn't be passing us a zero length
> iov data element, but it is wise to be cautious

Seems ok to me.

>
> > +    return ret;
> > +}
> > +
> > +static int qio_channel_socket_flush_zerocopy(QIOChannel *ioc,
> > +                                             Error **errp)
> > +{
> > +    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
> > +    struct msghdr msg = {};
> > +    struct sock_extended_err *serr;
> > +    struct cmsghdr *cm;
> > +    char control[CMSG_SPACE(sizeof(*serr))];
> > +    int ret;
>
> Add
>
>    int rv = 0;
>
> > +
> > +    msg.msg_control = control;
> > +    msg.msg_controllen = sizeof(control);
> > +    memset(control, 0, sizeof(control));
> > +
> > +    while (sioc->zerocopy_sent < sioc->zerocopy_queued) {
> > +        ret = recvmsg(sioc->fd, &msg, MSG_ERRQUEUE);
> > +        if (ret < 0) {
> > +            switch (errno) {
> > +            case EAGAIN:
> > +                /* Nothing on errqueue, wait until something is available */
> > +                ret = qio_channel_socket_poll(sioc, true, errp);
> > +                if (ret < 0) {
> > +                    return -1;
> > +                }
> > +                continue;
> > +            case EINTR:
> > +                continue;
> > +            default:
> > +                error_setg_errno(errp, errno,
> > +                                 "Unable to read errqueue");
> > +                return -1;
> > +            }
> > +        }
> > +
> > +        cm = CMSG_FIRSTHDR(&msg);
> > +        if (cm->cmsg_level != SOL_IP &&
> > +            cm->cmsg_type != IP_RECVERR) {
> > +            error_setg_errno(errp, EPROTOTYPE,
> > +                             "Wrong cmsg in errqueue");
> > +            return -1;
> > +        }
> > +
> > +        serr = (void *) CMSG_DATA(cm);
> > +        if (serr->ee_errno != SO_EE_ORIGIN_NONE) {
> > +            error_setg_errno(errp, serr->ee_errno,
> > +                             "Error on socket");
> > +            return -1;
> > +        }
> > +        if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
> > +            error_setg_errno(errp, serr->ee_origin,
> > +                             "Error not from zerocopy");
> > +            return -1;
> > +        }
> > +
> > +        /* No errors, count successfully finished sendmsg()*/
> > +        sioc->zerocopy_sent += serr->ee_data - serr->ee_info + 1;
>
> Here add
>
>
>      if (ee_code ==  SO_EE_CODE_ZEROCOPY_COPIED)
>         rv = 1;
>
> > +    }
> > +    return 0;
>
> return rv;
>
> > +}
>
>

I understand the idea of testing SO_EE_CODE_ZEROCOPY_COPIED to be able
to tell whenever zerocopy fell back to copying for some reason, but I
don't see how this can be helpful here.

Other than that I would do rv++ instead of rv=1 here, if I want to
keep track of how many buffers were sent with zerocopy and how many
ended up being copied.

>
> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>

Thanks for this feedback Daniel,

Best regards,
Leo
Daniel P. Berrangé Nov. 23, 2021, 9:55 a.m. UTC | #3
On Tue, Nov 23, 2021 at 01:46:44AM -0300, Leonardo Bras Soares Passos wrote:
> Hello Daniel,
> 
> On Fri, Nov 12, 2021 at 7:54 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> [...]
> > > @@ -561,12 +577,15 @@ static ssize_t qio_channel_socket_writev_flags(QIOChannel *ioc,
> > >   retry:
> > >      ret = sendmsg(sioc->fd, &msg, flags);
> > >      if (ret <= 0) {
> > > -        if (errno == EAGAIN) {
> > > +        switch (errno) {
> > > +        case EAGAIN:
> > >              return QIO_CHANNEL_ERR_BLOCK;
> > > -        }
> > > -        if (errno == EINTR) {
> > > +        case EINTR:
> > >              goto retry;
> > > +        case ENOBUFS:
> > > +            return QIO_CHANNEL_ERR_NOBUFS;
> >
> > Why does ENOBUFS need handling separately instead of letting
> > the error_setg_errno below handle it ?
> >
> > The caller immediately invokes error_setg_errno() again,
> > just with different error message.
> >
> > No code in this series ever looks at QIO_CHANNEL_ERR_NOBUFS
> > either, so we don't even need that special error return code
> > added AFAICT ?
> >
> 
> The idea was to add a custom message for ENOBUFS return when sending
> with MSG_ZEROCOPY.
> I mean, having this message is important for the user to understand
> why the migration is failing, but it would
> not make any sense to have this message while a non-zerocopy sendmsg()
> returns with ENOBUFS.
> 
> ENOBUFS : The output queue for a network interface was full.  This
> generally indicates that the interface has stopped sending, but may be
> caused by transient congestion.
> 
> As an alternative, I could add this message inside the switch, inside
> an if (flags & MSG_ZEROCOPY) on qio_channel_socket_writev_flags()
> instead of in it's caller.
> But for me it looks bloated, I mean, dealing with an error for
> ZEROCOPY only in the general function.

It is perfectly reasonable to check flags in this method.

> OTOH, if you think that it's a better idea to deal with every error in
> qio_channel_socket_writev_flags() instead of in the caller, I will
> change it for v6. Please let me know.

Yes, this method is already taking an ERror **errp parameter and
reporting a user facing error. If we need to report different
message text for ENOBUFS, it should be done in this method too.

The reason QIO_CHANNEL_ERR_BLOCK is special is because we are
explicitly not treating it as an error scenario at all.  That's
different to the ENOBUFS case.


> 
> > >          }
> > > +
> > >          error_setg_errno(errp, errno,
> > >                           "Unable to write to socket");
> > >          return -1;
> > > @@ -670,6 +689,127 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > >  }
> > >  #endif /* WIN32 */
> > >
> > > +
> > > +#ifdef CONFIG_LINUX
> > > +
> > > +static int qio_channel_socket_poll(QIOChannelSocket *sioc, bool zerocopy,
> > > +                                   Error **errp)
> >
> > There's only one caller and it always passes zerocopy=true,
> > so this parmeter looks pointless.
> 
> I did that for possible reuse of this function in the future:
> - As of today, this is certainly compiled out, but if at some point
> someone wants to use poll for something other
> than the reading of an zerocopy errqueue, it could be reused.
> 
> But sure, if that's not desirable, I can remove the parameter (and the
> if clause for !zerocopy).
> 
> >
> > > +{
> > > +    struct pollfd pfd;
> > > +    int ret;
> > > +
> > > +    pfd.fd = sioc->fd;
> > > +    pfd.events = 0;
> > > +
> > > + retry:
> > > +    ret = poll(&pfd, 1, -1);
> > > +    if (ret < 0) {
> > > +        switch (errno) {
> > > +        case EAGAIN:
> > > +        case EINTR:
> > > +            goto retry;
> > > +        default:
> > > +            error_setg_errno(errp, errno,
> > > +                             "Poll error");
> > > +            return ret;
> >
> >        return -1;
> >
> > > +        }
> > > +    }
> > > +
> > > +    if (pfd.revents & (POLLHUP | POLLNVAL)) {
> > > +        error_setg(errp, "Poll error: Invalid or disconnected fd");
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (!zerocopy && (pfd.revents & POLLERR)) {
> > > +        error_setg(errp, "Poll error: Errors present in errqueue");
> > > +        return -1;
> > > +    }
> >
> > > +
> > > +    return ret;
> >
> >   return 0;
> 
> In the idea of future reuse I spoke above, returning zero here would
> make this function always look like the poll timed out. Some future
> users may want to repeat the waiting if poll() timed out, or if
> (return > 0) stop polling.

Now that I'm looking again, we should not really use poll() at all,
as GLib provides us higher level APIs. We in fact already have the
qio_channel_wait() method as a general purpose helper for waiting
for an I/O condition to occur.


> I understand the idea of testing SO_EE_CODE_ZEROCOPY_COPIED to be able
> to tell whenever zerocopy fell back to copying for some reason, but I
> don't see how this can be helpful here.
> 
> Other than that I would do rv++ instead of rv=1 here, if I want to
> keep track of how many buffers were sent with zerocopy and how many
> ended up being copied.

Sure, we could do   "ret > 0 == number of buffers that were copied"
as the API contract, rather than just treating it as a boolean.



Regards,
Daniel
Leonardo Bras Dec. 3, 2021, 5:42 a.m. UTC | #4
Hello Daniel,

On Tue, Nov 23, 2021 at 6:56 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Tue, Nov 23, 2021 at 01:46:44AM -0300, Leonardo Bras Soares Passos wrote:
> > Hello Daniel,
> >
> > On Fri, Nov 12, 2021 at 7:54 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > [...]
> > > > @@ -561,12 +577,15 @@ static ssize_t qio_channel_socket_writev_flags(QIOChannel *ioc,
> > > >   retry:
> > > >      ret = sendmsg(sioc->fd, &msg, flags);
> > > >      if (ret <= 0) {
> > > > -        if (errno == EAGAIN) {
> > > > +        switch (errno) {
> > > > +        case EAGAIN:
> > > >              return QIO_CHANNEL_ERR_BLOCK;
> > > > -        }
> > > > -        if (errno == EINTR) {
> > > > +        case EINTR:
> > > >              goto retry;
> > > > +        case ENOBUFS:
> > > > +            return QIO_CHANNEL_ERR_NOBUFS;
> > >
> > > Why does ENOBUFS need handling separately instead of letting
> > > the error_setg_errno below handle it ?
> > >
> > > The caller immediately invokes error_setg_errno() again,
> > > just with different error message.
> > >
> > > No code in this series ever looks at QIO_CHANNEL_ERR_NOBUFS
> > > either, so we don't even need that special error return code
> > > added AFAICT ?
> > >
> >
> > The idea was to add a custom message for ENOBUFS return when sending
> > with MSG_ZEROCOPY.
> > I mean, having this message is important for the user to understand
> > why the migration is failing, but it would
> > not make any sense to have this message while a non-zerocopy sendmsg()
> > returns with ENOBUFS.
> >
> > ENOBUFS : The output queue for a network interface was full.  This
> > generally indicates that the interface has stopped sending, but may be
> > caused by transient congestion.
> >
> > As an alternative, I could add this message inside the switch, inside
> > an if (flags & MSG_ZEROCOPY) on qio_channel_socket_writev_flags()
> > instead of in it's caller.
> > But for me it looks bloated, I mean, dealing with an error for
> > ZEROCOPY only in the general function.
>
> It is perfectly reasonable to check flags in this method.
>
> > OTOH, if you think that it's a better idea to deal with every error in
> > qio_channel_socket_writev_flags() instead of in the caller, I will
> > change it for v6. Please let me know.
>
> Yes, this method is already taking an ERror **errp parameter and
> reporting a user facing error. If we need to report different
> message text for ENOBUFS, it should be done in this method too.
>
> The reason QIO_CHANNEL_ERR_BLOCK is special is because we are
> explicitly not treating it as an error scenario at all.  That's
> different to the ENOBUFS case.
>

Ok, I will change it for v6.

>
> >
> > > >          }
> > > > +
> > > >          error_setg_errno(errp, errno,
> > > >                           "Unable to write to socket");
> > > >          return -1;
> > > > @@ -670,6 +689,127 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > >  }
> > > >  #endif /* WIN32 */
> > > >
> > > > +
> > > > +#ifdef CONFIG_LINUX
> > > > +
> > > > +static int qio_channel_socket_poll(QIOChannelSocket *sioc, bool zerocopy,
> > > > +                                   Error **errp)
> > >
> > > There's only one caller and it always passes zerocopy=true,
> > > so this parmeter looks pointless.
> >
> > I did that for possible reuse of this function in the future:
> > - As of today, this is certainly compiled out, but if at some point
> > someone wants to use poll for something other
> > than the reading of an zerocopy errqueue, it could be reused.
> >
> > But sure, if that's not desirable, I can remove the parameter (and the
> > if clause for !zerocopy).
> >
> > >
> > > > +{
> > > > +    struct pollfd pfd;
> > > > +    int ret;
> > > > +
> > > > +    pfd.fd = sioc->fd;
> > > > +    pfd.events = 0;
> > > > +
> > > > + retry:
> > > > +    ret = poll(&pfd, 1, -1);
> > > > +    if (ret < 0) {
> > > > +        switch (errno) {
> > > > +        case EAGAIN:
> > > > +        case EINTR:
> > > > +            goto retry;
> > > > +        default:
> > > > +            error_setg_errno(errp, errno,
> > > > +                             "Poll error");
> > > > +            return ret;
> > >
> > >        return -1;
> > >
> > > > +        }
> > > > +    }
> > > > +
> > > > +    if (pfd.revents & (POLLHUP | POLLNVAL)) {
> > > > +        error_setg(errp, "Poll error: Invalid or disconnected fd");
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (!zerocopy && (pfd.revents & POLLERR)) {
> > > > +        error_setg(errp, "Poll error: Errors present in errqueue");
> > > > +        return -1;
> > > > +    }
> > >
> > > > +
> > > > +    return ret;
> > >
> > >   return 0;
> >
> > In the idea of future reuse I spoke above, returning zero here would
> > make this function always look like the poll timed out. Some future
> > users may want to repeat the waiting if poll() timed out, or if
> > (return > 0) stop polling.
>
> Now that I'm looking again, we should not really use poll() at all,
> as GLib provides us higher level APIs. We in fact already have the
> qio_channel_wait() method as a general purpose helper for waiting
> for an I/O condition to occur.;
>

So you suggest using
qio_channel_wait(sioc, G_IO_IN);
instead of creating the new qio_channel_socket_poll().

Is the above correct? I mean, is it as simple as that?

>
> > I understand the idea of testing SO_EE_CODE_ZEROCOPY_COPIED to be able
> > to tell whenever zerocopy fell back to copying for some reason, but I
> > don't see how this can be helpful here.
> >
> > Other than that I would do rv++ instead of rv=1 here, if I want to
> > keep track of how many buffers were sent with zerocopy and how many
> > ended up being copied.
>
> Sure, we could do   "ret > 0 == number of buffers that were copied"
> as the API contract, rather than just treating it as a boolean.

Ok, then you suggest the responsibility of checking the number of
writes with SO_EE_CODE_ZEROCOPY_COPIED, comparing with the total
number of writes,  and deciding whether to disable or not zerocopy
should be on the caller.

(Disabling zerocopy on a single SO_EE_CODE_ZEROCOPY_COPIED doesn't
seem a good idea also)

>
>
>
> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>

Best regards,
Leo
Daniel P. Berrangé Dec. 3, 2021, 9:17 a.m. UTC | #5
On Fri, Dec 03, 2021 at 02:42:19AM -0300, Leonardo Bras Soares Passos wrote:
> Hello Daniel,
> 
> On Tue, Nov 23, 2021 at 6:56 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> >
> > On Tue, Nov 23, 2021 at 01:46:44AM -0300, Leonardo Bras Soares Passos wrote:
> > > Hello Daniel,
> > >
> > > On Fri, Nov 12, 2021 at 7:54 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > > +
> > > > > +#ifdef CONFIG_LINUX
> > > > > +
> > > > > +static int qio_channel_socket_poll(QIOChannelSocket *sioc, bool zerocopy,
> > > > > +                                   Error **errp)
> > > >
> > > > There's only one caller and it always passes zerocopy=true,
> > > > so this parmeter looks pointless.
> > >
> > > I did that for possible reuse of this function in the future:
> > > - As of today, this is certainly compiled out, but if at some point
> > > someone wants to use poll for something other
> > > than the reading of an zerocopy errqueue, it could be reused.
> > >
> > > But sure, if that's not desirable, I can remove the parameter (and the
> > > if clause for !zerocopy).
> > >
> > > >
> > > > > +{
> > > > > +    struct pollfd pfd;
> > > > > +    int ret;
> > > > > +
> > > > > +    pfd.fd = sioc->fd;
> > > > > +    pfd.events = 0;
> > > > > +
> > > > > + retry:
> > > > > +    ret = poll(&pfd, 1, -1);
> > > > > +    if (ret < 0) {
> > > > > +        switch (errno) {
> > > > > +        case EAGAIN:
> > > > > +        case EINTR:
> > > > > +            goto retry;
> > > > > +        default:
> > > > > +            error_setg_errno(errp, errno,
> > > > > +                             "Poll error");
> > > > > +            return ret;
> > > >
> > > >        return -1;
> > > >
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    if (pfd.revents & (POLLHUP | POLLNVAL)) {
> > > > > +        error_setg(errp, "Poll error: Invalid or disconnected fd");
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (!zerocopy && (pfd.revents & POLLERR)) {
> > > > > +        error_setg(errp, "Poll error: Errors present in errqueue");
> > > > > +        return -1;
> > > > > +    }
> > > >
> > > > > +
> > > > > +    return ret;
> > > >
> > > >   return 0;
> > >
> > > In the idea of future reuse I spoke above, returning zero here would
> > > make this function always look like the poll timed out. Some future
> > > users may want to repeat the waiting if poll() timed out, or if
> > > (return > 0) stop polling.
> >
> > Now that I'm looking again, we should not really use poll() at all,
> > as GLib provides us higher level APIs. We in fact already have the
> > qio_channel_wait() method as a general purpose helper for waiting
> > for an I/O condition to occur.;
> >
> 
> So you suggest using
> qio_channel_wait(sioc, G_IO_IN);
> instead of creating the new qio_channel_socket_poll().
> 
> Is the above correct? I mean, is it as simple as that?

Yes, hopefully it is that simple.

> > > I understand the idea of testing SO_EE_CODE_ZEROCOPY_COPIED to be able
> > > to tell whenever zerocopy fell back to copying for some reason, but I
> > > don't see how this can be helpful here.
> > >
> > > Other than that I would do rv++ instead of rv=1 here, if I want to
> > > keep track of how many buffers were sent with zerocopy and how many
> > > ended up being copied.
> >
> > Sure, we could do   "ret > 0 == number of buffers that were copied"
> > as the API contract, rather than just treating it as a boolean.
> 
> Ok, then you suggest the responsibility of checking the number of
> writes with SO_EE_CODE_ZEROCOPY_COPIED, comparing with the total
> number of writes,  and deciding whether to disable or not zerocopy
> should be on the caller.

Yep, it's a usage policy, so it's nicer to allow the caller to decide
the policy.

Regards,
Daniel
Leonardo Bras Dec. 9, 2021, 8:38 a.m. UTC | #6
Hello Daniel,

On Fri, Dec 3, 2021 at 6:18 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Fri, Dec 03, 2021 at 02:42:19AM -0300, Leonardo Bras Soares Passos wrote:
> > Hello Daniel,
> >
> > On Tue, Nov 23, 2021 at 6:56 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > >
> > > On Tue, Nov 23, 2021 at 01:46:44AM -0300, Leonardo Bras Soares Passos wrote:
> > > > Hello Daniel,
> > > >
> > > > On Fri, Nov 12, 2021 at 7:54 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > > > +
> > > > > > +#ifdef CONFIG_LINUX
> > > > > > +
> > > > > > +static int qio_channel_socket_poll(QIOChannelSocket *sioc, bool zerocopy,
> > > > > > +                                   Error **errp)
> > > > >
> > > > > There's only one caller and it always passes zerocopy=true,
> > > > > so this parmeter looks pointless.
> > > >
> > > > I did that for possible reuse of this function in the future:
> > > > - As of today, this is certainly compiled out, but if at some point
> > > > someone wants to use poll for something other
> > > > than the reading of an zerocopy errqueue, it could be reused.
> > > >
> > > > But sure, if that's not desirable, I can remove the parameter (and the
> > > > if clause for !zerocopy).
> > > >
> > > > >
> > > > > > +{
> > > > > > +    struct pollfd pfd;
> > > > > > +    int ret;
> > > > > > +
> > > > > > +    pfd.fd = sioc->fd;
> > > > > > +    pfd.events = 0;
> > > > > > +
> > > > > > + retry:
> > > > > > +    ret = poll(&pfd, 1, -1);
> > > > > > +    if (ret < 0) {
> > > > > > +        switch (errno) {
> > > > > > +        case EAGAIN:
> > > > > > +        case EINTR:
> > > > > > +            goto retry;
> > > > > > +        default:
> > > > > > +            error_setg_errno(errp, errno,
> > > > > > +                             "Poll error");
> > > > > > +            return ret;
> > > > >
> > > > >        return -1;
> > > > >
> > > > > > +        }
> > > > > > +    }
> > > > > > +
> > > > > > +    if (pfd.revents & (POLLHUP | POLLNVAL)) {
> > > > > > +        error_setg(errp, "Poll error: Invalid or disconnected fd");
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (!zerocopy && (pfd.revents & POLLERR)) {
> > > > > > +        error_setg(errp, "Poll error: Errors present in errqueue");
> > > > > > +        return -1;
> > > > > > +    }
> > > > >
> > > > > > +
> > > > > > +    return ret;
> > > > >
> > > > >   return 0;
> > > >
> > > > In the idea of future reuse I spoke above, returning zero here would
> > > > make this function always look like the poll timed out. Some future
> > > > users may want to repeat the waiting if poll() timed out, or if
> > > > (return > 0) stop polling.
> > >
> > > Now that I'm looking again, we should not really use poll() at all,
> > > as GLib provides us higher level APIs. We in fact already have the
> > > qio_channel_wait() method as a general purpose helper for waiting
> > > for an I/O condition to occur.;
> > >
> >
> > So you suggest using
> > qio_channel_wait(sioc, G_IO_IN);
> > instead of creating the new qio_channel_socket_poll().
> >
> > Is the above correct? I mean, is it as simple as that?
>
> Yes, hopefully it is that simple.

It seems that is not the case.
After some testing, I found out that using this stalls the migration.

This happens when multifd_send_sync_main() calls flush_zerocopy(), but
the migration threads are
in multifd_send_thread() calling qemu_sem_wait(&p->sem);

I don't really understand enough of GLib to know how much this is
different from a poll(), but seems to make a difference.

>
> > > > I understand the idea of testing SO_EE_CODE_ZEROCOPY_COPIED to be able
> > > > to tell whenever zerocopy fell back to copying for some reason, but I
> > > > don't see how this can be helpful here.
> > > >
> > > > Other than that I would do rv++ instead of rv=1 here, if I want to
> > > > keep track of how many buffers were sent with zerocopy and how many
> > > > ended up being copied.
> > >
> > > Sure, we could do   "ret > 0 == number of buffers that were copied"
> > > as the API contract, rather than just treating it as a boolean.
> >
> > Ok, then you suggest the responsibility of checking the number of
> > writes with SO_EE_CODE_ZEROCOPY_COPIED, comparing with the total
> > number of writes,  and deciding whether to disable or not zerocopy
> > should be on the caller.
>
> Yep, its a usage policy so nicer to allow caller to decide the
> policy.
>
> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>
Leonardo Bras Dec. 9, 2021, 8:49 a.m. UTC | #7
On Thu, Dec 9, 2021 at 5:38 AM Leonardo Bras Soares Passos
<leobras@redhat.com> wrote:
>
> Hello Daniel,
>
> On Fri, Dec 3, 2021 at 6:18 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> >
> > On Fri, Dec 03, 2021 at 02:42:19AM -0300, Leonardo Bras Soares Passos wrote:
> > > Hello Daniel,
> > >
> > > On Tue, Nov 23, 2021 at 6:56 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > >
> > > > On Tue, Nov 23, 2021 at 01:46:44AM -0300, Leonardo Bras Soares Passos wrote:
> > > > > Hello Daniel,
> > > > >
> > > > > On Fri, Nov 12, 2021 at 7:54 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > > > > +
> > > > > > > +#ifdef CONFIG_LINUX
> > > > > > > +
> > > > > > > +static int qio_channel_socket_poll(QIOChannelSocket *sioc, bool zerocopy,
> > > > > > > +                                   Error **errp)
> > > > > >
> > > > > > There's only one caller and it always passes zerocopy=true,
> > > > > > so this parmeter looks pointless.
> > > > >
> > > > > I did that for possible reuse of this function in the future:
> > > > > - As of today, this is certainly compiled out, but if at some point
> > > > > someone wants to use poll for something other
> > > > > than the reading of an zerocopy errqueue, it could be reused.
> > > > >
> > > > > But sure, if that's not desirable, I can remove the parameter (and the
> > > > > if clause for !zerocopy).
> > > > >
> > > > > >
> > > > > > > +{
> > > > > > > +    struct pollfd pfd;
> > > > > > > +    int ret;
> > > > > > > +
> > > > > > > +    pfd.fd = sioc->fd;
> > > > > > > +    pfd.events = 0;
> > > > > > > +
> > > > > > > + retry:
> > > > > > > +    ret = poll(&pfd, 1, -1);
> > > > > > > +    if (ret < 0) {
> > > > > > > +        switch (errno) {
> > > > > > > +        case EAGAIN:
> > > > > > > +        case EINTR:
> > > > > > > +            goto retry;
> > > > > > > +        default:
> > > > > > > +            error_setg_errno(errp, errno,
> > > > > > > +                             "Poll error");
> > > > > > > +            return ret;
> > > > > >
> > > > > >        return -1;
> > > > > >
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    if (pfd.revents & (POLLHUP | POLLNVAL)) {
> > > > > > > +        error_setg(errp, "Poll error: Invalid or disconnected fd");
> > > > > > > +        return -1;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    if (!zerocopy && (pfd.revents & POLLERR)) {
> > > > > > > +        error_setg(errp, "Poll error: Errors present in errqueue");
> > > > > > > +        return -1;
> > > > > > > +    }
> > > > > >
> > > > > > > +
> > > > > > > +    return ret;
> > > > > >
> > > > > >   return 0;
> > > > >
> > > > > In the idea of future reuse I spoke above, returning zero here would
> > > > > make this function always look like the poll timed out. Some future
> > > > > users may want to repeat the waiting if poll() timed out, or if
> > > > > (return > 0) stop polling.
> > > >
> > > > Now that I'm looking again, we should not really use poll() at all,
> > > > as GLib provides us higher level APIs. We in fact already have the
> > > > qio_channel_wait() method as a general purpose helper for waiting
> > > > for an I/O condition to occur.;
> > > >
> > >
> > > So you suggest using
> > > qio_channel_wait(sioc, G_IO_IN);
> > > instead of creating the new qio_channel_socket_poll().
> > >
> > > Is the above correct? I mean, is it as simple as that?
> >
> > Yes, hopefully it is that simple.
>
> It seems not to be the case.
> After some testing, I found out using this stalls the migration.
>
> This happens when multifd_send_sync_main() calls flush_zerocopy(), but
> the migration threads are
> in multifd_send_thread() calling qemu_sem_wait(&p->sem);
>
> I don't really understand enough of GLib to know how much this is
> different from a poll(), but seems to make a difference.

Oh, never mind.
A few minutes of reading the GLib docs was enough for me to understand my mistake.
We will need to use G_IO_ERR instead of G_IO_IN, because we are
waiting for messages
in the ERRQUEUE.

>
> >
> > > > > I understand the idea of testing SO_EE_CODE_ZEROCOPY_COPIED to be able
> > > > > to tell whenever zerocopy fell back to copying for some reason, but I
> > > > > don't see how this can be helpful here.
> > > > >
> > > > > Other than that I would do rv++ instead of rv=1 here, if I want to
> > > > > keep track of how many buffers were sent with zerocopy and how many
> > > > > ended up being copied.
> > > >
> > > > Sure, we could do   "ret > 0 == number of buffers that were copied"
> > > > as the API contract, rather than just treating it as a boolean.
> > >
> > > Ok, then you suggest the responsibility of checking the number of
> > > writes with SO_EE_CODE_ZEROCOPY_COPIED, comparing with the total
> > > number of writes,  and deciding whether to disable or not zerocopy
> > > should be on the caller.
> >
> > Yep, its a usage policy so nicer to allow caller to decide the
> > policy.
> >
> > Regards,
> > Daniel
> > --
> > |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> > |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> > |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
> >
diff mbox series

Patch

diff --git a/include/io/channel-socket.h b/include/io/channel-socket.h
index e747e63514..81d04baa4c 100644
--- a/include/io/channel-socket.h
+++ b/include/io/channel-socket.h
@@ -47,6 +47,8 @@  struct QIOChannelSocket {
     socklen_t localAddrLen;
     struct sockaddr_storage remoteAddr;
     socklen_t remoteAddrLen;
+    ssize_t zerocopy_queued;
+    ssize_t zerocopy_sent;
 };
 
 
diff --git a/include/io/channel.h b/include/io/channel.h
index a19c09bb84..051fff4197 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -31,6 +31,7 @@  OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
 
 
 #define QIO_CHANNEL_ERR_BLOCK -2
+#define QIO_CHANNEL_ERR_NOBUFS -3
 
 #define QIO_CHANNEL_WRITE_FLAG_ZEROCOPY 0x1
 
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b57a27bf91..c724b849ad 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -26,6 +26,10 @@ 
 #include "io/channel-watch.h"
 #include "trace.h"
 #include "qapi/clone-visitor.h"
+#ifdef CONFIG_LINUX
+#include <linux/errqueue.h>
+#include <poll.h>
+#endif
 
 #define SOCKET_MAX_FDS 16
 
@@ -55,6 +59,8 @@  qio_channel_socket_new(void)
 
     sioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET));
     sioc->fd = -1;
+    sioc->zerocopy_queued = 0;
+    sioc->zerocopy_sent = 0;
 
     ioc = QIO_CHANNEL(sioc);
     qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN);
@@ -140,6 +146,7 @@  int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
                                     Error **errp)
 {
     int fd;
+    int ret, v = 1;
 
     trace_qio_channel_socket_connect_sync(ioc, addr);
     fd = socket_connect(addr, errp);
@@ -154,6 +161,15 @@  int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
         return -1;
     }
 
+#ifdef CONFIG_LINUX
+    ret = qemu_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &v, sizeof(v));
+    if (ret == 0) {
+        /* Zerocopy available on host */
+        qio_channel_set_feature(QIO_CHANNEL(ioc),
+                                QIO_CHANNEL_FEATURE_WRITE_ZEROCOPY);
+    }
+#endif
+
     return 0;
 }
 
@@ -561,12 +577,15 @@  static ssize_t qio_channel_socket_writev_flags(QIOChannel *ioc,
  retry:
     ret = sendmsg(sioc->fd, &msg, flags);
     if (ret <= 0) {
-        if (errno == EAGAIN) {
+        switch (errno) {
+        case EAGAIN:
             return QIO_CHANNEL_ERR_BLOCK;
-        }
-        if (errno == EINTR) {
+        case EINTR:
             goto retry;
+        case ENOBUFS:
+            return QIO_CHANNEL_ERR_NOBUFS;
         }
+
         error_setg_errno(errp, errno,
                          "Unable to write to socket");
         return -1;
@@ -670,6 +689,127 @@  static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
 }
 #endif /* WIN32 */
 
+
+#ifdef CONFIG_LINUX
+
+static int qio_channel_socket_poll(QIOChannelSocket *sioc, bool zerocopy,
+                                   Error **errp)
+{
+    struct pollfd pfd;
+    int ret;
+
+    pfd.fd = sioc->fd;
+    pfd.events = 0;
+
+ retry:
+    ret = poll(&pfd, 1, -1);
+    if (ret < 0) {
+        switch (errno) {
+        case EAGAIN:
+        case EINTR:
+            goto retry;
+        default:
+            error_setg_errno(errp, errno,
+                             "Poll error");
+            return ret;
+        }
+    }
+
+    if (pfd.revents & (POLLHUP | POLLNVAL)) {
+        error_setg(errp, "Poll error: Invalid or disconnected fd");
+        return -1;
+    }
+
+    if (!zerocopy && (pfd.revents & POLLERR)) {
+        error_setg(errp, "Poll error: Errors present in errqueue");
+        return -1;
+    }
+
+    return ret;
+}
+
+static ssize_t qio_channel_socket_writev_zerocopy(QIOChannel *ioc,
+                                                  const struct iovec *iov,
+                                                  size_t niov,
+                                                  Error **errp)
+{
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+    ssize_t ret;
+
+    ret = qio_channel_socket_writev_flags(ioc, iov, niov, NULL, 0,
+                                          MSG_ZEROCOPY, errp);
+    if (ret == QIO_CHANNEL_ERR_NOBUFS) {
+        error_setg_errno(errp, errno,
+                         "Process can't lock enough memory for using MSG_ZEROCOPY");
+        return -1;
+    }
+
+    sioc->zerocopy_queued++;
+    return ret;
+}
+
+static int qio_channel_socket_flush_zerocopy(QIOChannel *ioc,
+                                             Error **errp)
+{
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+    struct msghdr msg = {};
+    struct sock_extended_err *serr;
+    struct cmsghdr *cm;
+    char control[CMSG_SPACE(sizeof(*serr))];
+    int ret;
+
+    msg.msg_control = control;
+    msg.msg_controllen = sizeof(control);
+    memset(control, 0, sizeof(control));
+
+    while (sioc->zerocopy_sent < sioc->zerocopy_queued) {
+        ret = recvmsg(sioc->fd, &msg, MSG_ERRQUEUE);
+        if (ret < 0) {
+            switch (errno) {
+            case EAGAIN:
+                /* Nothing on errqueue, wait until something is available */
+                ret = qio_channel_socket_poll(sioc, true, errp);
+                if (ret < 0) {
+                    return -1;
+                }
+                continue;
+            case EINTR:
+                continue;
+            default:
+                error_setg_errno(errp, errno,
+                                 "Unable to read errqueue");
+                return -1;
+            }
+        }
+
+        cm = CMSG_FIRSTHDR(&msg);
+        if (cm->cmsg_level != SOL_IP &&
+            cm->cmsg_type != IP_RECVERR) {
+            error_setg_errno(errp, EPROTOTYPE,
+                             "Wrong cmsg in errqueue");
+            return -1;
+        }
+
+        serr = (void *) CMSG_DATA(cm);
+        if (serr->ee_errno != SO_EE_ORIGIN_NONE) {
+            error_setg_errno(errp, serr->ee_errno,
+                             "Error on socket");
+            return -1;
+        }
+        if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
+            error_setg_errno(errp, serr->ee_origin,
+                             "Error not from zerocopy");
+            return -1;
+        }
+
+        /* No errors, count successfully finished sendmsg()*/
+        sioc->zerocopy_sent += serr->ee_data - serr->ee_info + 1;
+    }
+    return 0;
+}
+
+#endif /* CONFIG_LINUX */
+
 static int
 qio_channel_socket_set_blocking(QIOChannel *ioc,
                                 bool enabled,
@@ -799,6 +939,10 @@  static void qio_channel_socket_class_init(ObjectClass *klass,
     ioc_klass->io_set_delay = qio_channel_socket_set_delay;
     ioc_klass->io_create_watch = qio_channel_socket_create_watch;
     ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
+#ifdef CONFIG_LINUX
+    ioc_klass->io_writev_zerocopy = qio_channel_socket_writev_zerocopy;
+    ioc_klass->io_flush_zerocopy = qio_channel_socket_flush_zerocopy;
+#endif
 }
 
 static const TypeInfo qio_channel_socket_info = {