diff mbox series

[v7,2/5] QIOChannelSocket: Implement io_writev zero copy flag & io_flush for CONFIG_LINUX

Message ID 20220106221341.8779-3-leobras@redhat.com (mailing list archive)
State New, archived
Headers show
Series MSG_ZEROCOPY + multifd | expand

Commit Message

Leonardo Bras Jan. 6, 2022, 10:13 p.m. UTC
For CONFIG_LINUX, implement the new zero copy flag and the optional callback
io_flush on QIOChannelSocket, but enables it only when MSG_ZEROCOPY
feature is available in the host kernel, which is checked on
qio_channel_socket_connect_sync()

qio_channel_socket_flush() was implemented by counting how many times
sendmsg(...,MSG_ZEROCOPY) was successfully called, and then reading the
socket's error queue, in order to find how many of them finished sending.
Flush will loop until those counters are the same, or until some error occurs.

Notes on using writev() with QIO_CHANNEL_WRITE_FLAG_ZERO_COPY:
1: Buffer
- As MSG_ZEROCOPY tells the kernel to use the same user buffer to avoid copying,
some caution is necessary to avoid overwriting any buffer before it's sent.
If something like this happen, a newer version of the buffer may be sent instead.
- If this is a problem, it's recommended to call qio_channel_flush() before freeing
or re-using the buffer.

2: Locked memory
- When using MSG_ZERCOCOPY, the buffer memory will be locked after queued, and
unlocked after it's sent.
- Depending on the size of each buffer, and how often it's sent, it may require
a larger amount of locked memory than usually available to non-root user.
- If the required amount of locked memory is not available, writev_zero_copy
will return an error, which can abort an operation like migration,
- Because of this, when an user code wants to add zero copy as a feature, it
requires a mechanism to disable it, so it can still be accessible to less
privileged users.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
---
 include/io/channel-socket.h |   2 +
 io/channel-socket.c         | 107 ++++++++++++++++++++++++++++++++++--
 2 files changed, 105 insertions(+), 4 deletions(-)

Comments

Peter Xu Jan. 13, 2022, 6:48 a.m. UTC | #1
On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
>          memcpy(CMSG_DATA(cmsg), fds, fdsize);
>      }
>  
> +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> +        sflags = MSG_ZEROCOPY;
> +    }
> +
>   retry:
> -    ret = sendmsg(sioc->fd, &msg, 0);
> +    ret = sendmsg(sioc->fd, &msg, sflags);
>      if (ret <= 0) {
> -        if (errno == EAGAIN) {
> +        switch (errno) {
> +        case EAGAIN:
>              return QIO_CHANNEL_ERR_BLOCK;
> -        }
> -        if (errno == EINTR) {
> +        case EINTR:
>              goto retry;
> +        case ENOBUFS:
> +            if (sflags & MSG_ZEROCOPY) {
> +                error_setg_errno(errp, errno,
> +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> +                return -1;
> +            }

I have no idea whether it'll make a real differnece, but - should we better add
a "break" here?  If you agree and with that fixed, feel free to add:

Reviewed-by: Peter Xu <peterx@redhat.com>

I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
here it's by default unlimited, but just curious when we should keep an eye.

Thanks,
Daniel P. Berrangé Jan. 13, 2022, 10:06 a.m. UTC | #2
On Thu, Jan 13, 2022 at 02:48:15PM +0800, Peter Xu wrote:
> On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> >      }
> >  
> > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > +        sflags = MSG_ZEROCOPY;
> > +    }
> > +
> >   retry:
> > -    ret = sendmsg(sioc->fd, &msg, 0);
> > +    ret = sendmsg(sioc->fd, &msg, sflags);
> >      if (ret <= 0) {
> > -        if (errno == EAGAIN) {
> > +        switch (errno) {
> > +        case EAGAIN:
> >              return QIO_CHANNEL_ERR_BLOCK;
> > -        }
> > -        if (errno == EINTR) {
> > +        case EINTR:
> >              goto retry;
> > +        case ENOBUFS:
> > +            if (sflags & MSG_ZEROCOPY) {
> > +                error_setg_errno(errp, errno,
> > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > +                return -1;
> > +            }
> 
> I have no idea whether it'll make a real differnece, but - should we better add
> a "break" here?  If you agree and with that fixed, feel free to add:
> 
> Reviewed-by: Peter Xu <peterx@redhat.com>
> 
> I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> here it's by default unlimited, but just curious when we should keep an eye.

Fedora doesn't allow unlimited locked memory by default

$ grep "locked memory" /proc/self/limits 
Max locked memory         65536                65536                bytes     

And  regardless of Fedora defaults, libvirt will set a limit
for the guest. It will only be unlimited if requiring certain
things like VFIO.

Regards,
Daniel
Peter Xu Jan. 13, 2022, 10:34 a.m. UTC | #3
On Thu, Jan 13, 2022 at 10:06:14AM +0000, Daniel P. Berrangé wrote:
> On Thu, Jan 13, 2022 at 02:48:15PM +0800, Peter Xu wrote:
> > On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> > >      }
> > >  
> > > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > > +        sflags = MSG_ZEROCOPY;
> > > +    }
> > > +
> > >   retry:
> > > -    ret = sendmsg(sioc->fd, &msg, 0);
> > > +    ret = sendmsg(sioc->fd, &msg, sflags);
> > >      if (ret <= 0) {
> > > -        if (errno == EAGAIN) {
> > > +        switch (errno) {
> > > +        case EAGAIN:
> > >              return QIO_CHANNEL_ERR_BLOCK;
> > > -        }
> > > -        if (errno == EINTR) {
> > > +        case EINTR:
> > >              goto retry;
> > > +        case ENOBUFS:
> > > +            if (sflags & MSG_ZEROCOPY) {
> > > +                error_setg_errno(errp, errno,
> > > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > > +                return -1;
> > > +            }
> > 
> > I have no idea whether it'll make a real differnece, but - should we better add
> > a "break" here?  If you agree and with that fixed, feel free to add:
> > 
> > Reviewed-by: Peter Xu <peterx@redhat.com>
> > 
> > I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> > here it's by default unlimited, but just curious when we should keep an eye.
> 
> Fedora doesn't allow unlimited locked memory by default
> 
> $ grep "locked memory" /proc/self/limits 
> Max locked memory         65536                65536                bytes     
> 
> And  regardless of Fedora defaults, libvirt will set a limit
> for the guest. It will only be unlimited if requiring certain
> things like VFIO.

Thanks, I obviously checked up the wrong host..

Leo, do you know how much locked memory will be needed by zero copy?  Will
there be a limit?  Is it linear to the number of sockets/channels?

It'll be better if we can fail at enabling the feature when we detected that
the specified locked memory limit may not be suffice.
Daniel P. Berrangé Jan. 13, 2022, 10:42 a.m. UTC | #4
On Thu, Jan 13, 2022 at 06:34:12PM +0800, Peter Xu wrote:
> On Thu, Jan 13, 2022 at 10:06:14AM +0000, Daniel P. Berrangé wrote:
> > On Thu, Jan 13, 2022 at 02:48:15PM +0800, Peter Xu wrote:
> > > On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > > > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> > > >      }
> > > >  
> > > > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > > > +        sflags = MSG_ZEROCOPY;
> > > > +    }
> > > > +
> > > >   retry:
> > > > -    ret = sendmsg(sioc->fd, &msg, 0);
> > > > +    ret = sendmsg(sioc->fd, &msg, sflags);
> > > >      if (ret <= 0) {
> > > > -        if (errno == EAGAIN) {
> > > > +        switch (errno) {
> > > > +        case EAGAIN:
> > > >              return QIO_CHANNEL_ERR_BLOCK;
> > > > -        }
> > > > -        if (errno == EINTR) {
> > > > +        case EINTR:
> > > >              goto retry;
> > > > +        case ENOBUFS:
> > > > +            if (sflags & MSG_ZEROCOPY) {
> > > > +                error_setg_errno(errp, errno,
> > > > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > > > +                return -1;
> > > > +            }
> > > 
> > > I have no idea whether it'll make a real differnece, but - should we better add
> > > a "break" here?  If you agree and with that fixed, feel free to add:
> > > 
> > > Reviewed-by: Peter Xu <peterx@redhat.com>
> > > 
> > > I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> > > here it's by default unlimited, but just curious when we should keep an eye.
> > 
> > Fedora doesn't allow unlimited locked memory by default
> > 
> > $ grep "locked memory" /proc/self/limits 
> > Max locked memory         65536                65536                bytes     
> > 
> > And  regardless of Fedora defaults, libvirt will set a limit
> > for the guest. It will only be unlimited if requiring certain
> > things like VFIO.
> 
> Thanks, I obviously checked up the wrong host..
> 
> Leo, do you know how much locked memory will be needed by zero copy?  Will
> there be a limit?  Is it linear to the number of sockets/channels?

IIRC we decided it would be limited by the socket send buffer size, rather
than guest RAM, because writes will block once the send buffer is full.

This has a default global setting, with per-socket override. On one box I
have it is 200 Kb. With multifd you'll need  "num-sockets * send buffer".

> It'll be better if we can fail at enabling the feature when we detected that
> the specified locked memory limit may not be suffice.

Checking this value against available locked memory though will always
have an error margin because other things in QEMU can use locked memory
too


Regards,
Daniel
Peter Xu Jan. 13, 2022, 12:12 p.m. UTC | #5
On Thu, Jan 13, 2022 at 10:42:39AM +0000, Daniel P. Berrangé wrote:
> On Thu, Jan 13, 2022 at 06:34:12PM +0800, Peter Xu wrote:
> > On Thu, Jan 13, 2022 at 10:06:14AM +0000, Daniel P. Berrangé wrote:
> > > On Thu, Jan 13, 2022 at 02:48:15PM +0800, Peter Xu wrote:
> > > > On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > > > > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> > > > >      }
> > > > >  
> > > > > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > > > > +        sflags = MSG_ZEROCOPY;
> > > > > +    }
> > > > > +
> > > > >   retry:
> > > > > -    ret = sendmsg(sioc->fd, &msg, 0);
> > > > > +    ret = sendmsg(sioc->fd, &msg, sflags);
> > > > >      if (ret <= 0) {
> > > > > -        if (errno == EAGAIN) {
> > > > > +        switch (errno) {
> > > > > +        case EAGAIN:
> > > > >              return QIO_CHANNEL_ERR_BLOCK;
> > > > > -        }
> > > > > -        if (errno == EINTR) {
> > > > > +        case EINTR:
> > > > >              goto retry;
> > > > > +        case ENOBUFS:
> > > > > +            if (sflags & MSG_ZEROCOPY) {
> > > > > +                error_setg_errno(errp, errno,
> > > > > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > > > > +                return -1;
> > > > > +            }
> > > > 
> > > > I have no idea whether it'll make a real differnece, but - should we better add
> > > > a "break" here?  If you agree and with that fixed, feel free to add:
> > > > 
> > > > Reviewed-by: Peter Xu <peterx@redhat.com>
> > > > 
> > > > I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> > > > here it's by default unlimited, but just curious when we should keep an eye.
> > > 
> > > Fedora doesn't allow unlimited locked memory by default
> > > 
> > > $ grep "locked memory" /proc/self/limits 
> > > Max locked memory         65536                65536                bytes     
> > > 
> > > And  regardless of Fedora defaults, libvirt will set a limit
> > > for the guest. It will only be unlimited if requiring certain
> > > things like VFIO.
> > 
> > Thanks, I obviously checked up the wrong host..
> > 
> > Leo, do you know how much locked memory will be needed by zero copy?  Will
> > there be a limit?  Is it linear to the number of sockets/channels?
> 
> IIRC we decided it would be limited by the socket send buffer size, rather
> than guest RAM, because writes will block once the send buffer is full.
> 
> This has a default global setting, with per-socket override. On one box I
> have it is 200 Kb. With multifd you'll need  "num-sockets * send buffer".
> 
> > It'll be better if we can fail at enabling the feature when we detected that
> > the specified locked memory limit may not be suffice.
> 
> Checking this value against available locked memory though will always
> have an error margin because other things in QEMU can use locked memory
> too

We could always still allow false positive in this check, so we can fail if we
have a solid clue to know we'll fail later (e.g. minimum locked_vm needed is
already less than total).  But no strong opinion; we could have this merged and
see whether that's needed in real life.  Thanks,
Daniel P. Berrangé Jan. 13, 2022, 1:05 p.m. UTC | #6
On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> For CONFIG_LINUX, implement the new zero copy flag and the optional callback
> io_flush on QIOChannelSocket, but enables it only when MSG_ZEROCOPY
> feature is available in the host kernel, which is checked on
> qio_channel_socket_connect_sync()
> 
> qio_channel_socket_flush() was implemented by counting how many times
> sendmsg(...,MSG_ZEROCOPY) was successfully called, and then reading the
> socket's error queue, in order to find how many of them finished sending.
> Flush will loop until those counters are the same, or until some error occurs.
> 
> Notes on using writev() with QIO_CHANNEL_WRITE_FLAG_ZERO_COPY:
> 1: Buffer
> - As MSG_ZEROCOPY tells the kernel to use the same user buffer to avoid copying,
> some caution is necessary to avoid overwriting any buffer before it's sent.
> If something like this happen, a newer version of the buffer may be sent instead.
> - If this is a problem, it's recommended to call qio_channel_flush() before freeing
> or re-using the buffer.
> 
> 2: Locked memory
> - When using MSG_ZERCOCOPY, the buffer memory will be locked after queued, and
> unlocked after it's sent.
> - Depending on the size of each buffer, and how often it's sent, it may require
> a larger amount of locked memory than usually available to non-root user.
> - If the required amount of locked memory is not available, writev_zero_copy
> will return an error, which can abort an operation like migration,
> - Because of this, when an user code wants to add zero copy as a feature, it
> requires a mechanism to disable it, so it can still be accessible to less
> privileged users.
> 
> Signed-off-by: Leonardo Bras <leobras@redhat.com>
> ---
>  include/io/channel-socket.h |   2 +
>  io/channel-socket.c         | 107 ++++++++++++++++++++++++++++++++++--
>  2 files changed, 105 insertions(+), 4 deletions(-)

Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>


Regards,
Daniel
Leonardo Bras Jan. 19, 2022, 5:01 p.m. UTC | #7
Hello Peter,

On Thu, Jan 13, 2022 at 3:48 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> >      }
> >
> > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > +        sflags = MSG_ZEROCOPY;
> > +    }
> > +
> >   retry:
> > -    ret = sendmsg(sioc->fd, &msg, 0);
> > +    ret = sendmsg(sioc->fd, &msg, sflags);
> >      if (ret <= 0) {
> > -        if (errno == EAGAIN) {
> > +        switch (errno) {
> > +        case EAGAIN:
> >              return QIO_CHANNEL_ERR_BLOCK;
> > -        }
> > -        if (errno == EINTR) {
> > +        case EINTR:
> >              goto retry;
> > +        case ENOBUFS:
> > +            if (sflags & MSG_ZEROCOPY) {
> > +                error_setg_errno(errp, errno,
> > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > +                return -1;
> > +            }
>
> I have no idea whether it'll make a real differnece, but - should we better add
> a "break" here?

Here I followed the standard of the EAGAIN error, that's why I just returned -1.

IIUC A break here would cause the errp to be re-set to the default
message, after the switch.
Another option would be to add a 'default' clause, and move the
default error msg there, and return the -1
after the switch.

In the end I thought the current way was simpler, but it's no issue
to change if you think the 'default' idea would be better.

>  If you agree and with that fixed, feel free to add:
>
> Reviewed-by: Peter Xu <peterx@redhat.com>
>

Thanks!

> I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> here it's by default unlimited, but just curious when we should keep an eye.

It's unlimited if you run as root IIRC.

>
> Thanks,
>
> --
> Peter Xu
>
Leonardo Bras Soares Passos Jan. 19, 2022, 5:16 p.m. UTC | #8
On Thu, Jan 13, 2022 at 7:34 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Jan 13, 2022 at 10:06:14AM +0000, Daniel P. Berrangé wrote:
> > On Thu, Jan 13, 2022 at 02:48:15PM +0800, Peter Xu wrote:
> > > On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > > > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> > > >      }
> > > >
> > > > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > > > +        sflags = MSG_ZEROCOPY;
> > > > +    }
> > > > +
> > > >   retry:
> > > > -    ret = sendmsg(sioc->fd, &msg, 0);
> > > > +    ret = sendmsg(sioc->fd, &msg, sflags);
> > > >      if (ret <= 0) {
> > > > -        if (errno == EAGAIN) {
> > > > +        switch (errno) {
> > > > +        case EAGAIN:
> > > >              return QIO_CHANNEL_ERR_BLOCK;
> > > > -        }
> > > > -        if (errno == EINTR) {
> > > > +        case EINTR:
> > > >              goto retry;
> > > > +        case ENOBUFS:
> > > > +            if (sflags & MSG_ZEROCOPY) {
> > > > +                error_setg_errno(errp, errno,
> > > > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > > > +                return -1;
> > > > +            }
> > >
> > > I have no idea whether it'll make a real differnece, but - should we better add
> > > a "break" here?  If you agree and with that fixed, feel free to add:
> > >
> > > Reviewed-by: Peter Xu <peterx@redhat.com>
> > >
> > > I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> > > here it's by default unlimited, but just curious when we should keep an eye.
> >
> > Fedora doesn't allow unlimited locked memory by default
> >
> > $ grep "locked memory" /proc/self/limits
> > Max locked memory         65536                65536                bytes
> >
> > And  regardless of Fedora defaults, libvirt will set a limit
> > for the guest. It will only be unlimited if requiring certain
> > things like VFIO.
>
> Thanks, I obviously checked up the wrong host..
>
> Leo, do you know how much locked memory will be needed by zero copy?  Will
> there be a limit?  Is it linear to the number of sockets/channels?

It depends on the number of channels, of course, but there are
influencing factors, like network bandwidth & usage, and cpu speed &
usage, network queue size, VM pagesize and so on.

A simple exemple:
If the cpu is free/fast, but there are other applications using the
network, we may enqueue a lot of stuff for sending, and end up needing
a lot of locked memory.

I don't think it's easy to calculate a good reference value for locked
memory here.

>
> It'll be better if we can fail at enabling the feature when we detected that
> the specified locked memory limit may not be suffice.

I agree it's a good idea. But having this reference value calculated
is not much simple, IIUC.

>
> --
> Peter Xu
>
Leonardo Bras Soares Passos Jan. 19, 2022, 5:22 p.m. UTC | #9
Hello Daniel,

On Thu, Jan 13, 2022 at 7:42 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Thu, Jan 13, 2022 at 06:34:12PM +0800, Peter Xu wrote:
> > On Thu, Jan 13, 2022 at 10:06:14AM +0000, Daniel P. Berrangé wrote:
> > > On Thu, Jan 13, 2022 at 02:48:15PM +0800, Peter Xu wrote:
> > > > On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > > > > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> > > > >      }
> > > > >
> > > > > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > > > > +        sflags = MSG_ZEROCOPY;
> > > > > +    }
> > > > > +
> > > > >   retry:
> > > > > -    ret = sendmsg(sioc->fd, &msg, 0);
> > > > > +    ret = sendmsg(sioc->fd, &msg, sflags);
> > > > >      if (ret <= 0) {
> > > > > -        if (errno == EAGAIN) {
> > > > > +        switch (errno) {
> > > > > +        case EAGAIN:
> > > > >              return QIO_CHANNEL_ERR_BLOCK;
> > > > > -        }
> > > > > -        if (errno == EINTR) {
> > > > > +        case EINTR:
> > > > >              goto retry;
> > > > > +        case ENOBUFS:
> > > > > +            if (sflags & MSG_ZEROCOPY) {
> > > > > +                error_setg_errno(errp, errno,
> > > > > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > > > > +                return -1;
> > > > > +            }
> > > >
> > > > I have no idea whether it'll make a real differnece, but - should we better add
> > > > a "break" here?  If you agree and with that fixed, feel free to add:
> > > >
> > > > Reviewed-by: Peter Xu <peterx@redhat.com>
> > > >
> > > > I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> > > > here it's by default unlimited, but just curious when we should keep an eye.
> > >
> > > Fedora doesn't allow unlimited locked memory by default
> > >
> > > $ grep "locked memory" /proc/self/limits
> > > Max locked memory         65536                65536                bytes
> > >
> > > And  regardless of Fedora defaults, libvirt will set a limit
> > > for the guest. It will only be unlimited if requiring certain
> > > things like VFIO.
> >
> > Thanks, I obviously checked up the wrong host..
> >
> > Leo, do you know how much locked memory will be needed by zero copy?  Will
> > there be a limit?  Is it linear to the number of sockets/channels?
>
> IIRC we decided it would be limited by the socket send buffer size, rather
> than guest RAM, because writes will block once the send buffer is full.
>
> This has a default global setting, with per-socket override. On one box I
> have it is 200 Kb. With multifd you'll need  "num-sockets * send buffer".

Oh, I was not aware there is a send buffer size (or maybe I am unable
to recall).
That sure makes things much easier.

>
> > It'll be better if we can fail at enabling the feature when we detected that
> > the specified locked memory limit may not be suffice.

sure

>
> Checking this value against available locked memory though will always
> have an error margin because other things in QEMU can use locked memory
> too

We can get the current limit (before zerocopy) as an error margin:
req_lock_mem = num-sockets * send buffer + BASE_LOCKED

Where BASE_LOCKED is the current libvirt value, or so on.

What do you think?

Best regards,
Leo
Leonardo Bras Soares Passos Jan. 19, 2022, 5:23 p.m. UTC | #10
On Thu, Jan 13, 2022 at 9:12 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Jan 13, 2022 at 10:42:39AM +0000, Daniel P. Berrangé wrote:
> > On Thu, Jan 13, 2022 at 06:34:12PM +0800, Peter Xu wrote:
> > > On Thu, Jan 13, 2022 at 10:06:14AM +0000, Daniel P. Berrangé wrote:
> > > > On Thu, Jan 13, 2022 at 02:48:15PM +0800, Peter Xu wrote:
> > > > > On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > > > > > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > > >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> > > > > >      }
> > > > > >
> > > > > > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > > > > > +        sflags = MSG_ZEROCOPY;
> > > > > > +    }
> > > > > > +
> > > > > >   retry:
> > > > > > -    ret = sendmsg(sioc->fd, &msg, 0);
> > > > > > +    ret = sendmsg(sioc->fd, &msg, sflags);
> > > > > >      if (ret <= 0) {
> > > > > > -        if (errno == EAGAIN) {
> > > > > > +        switch (errno) {
> > > > > > +        case EAGAIN:
> > > > > >              return QIO_CHANNEL_ERR_BLOCK;
> > > > > > -        }
> > > > > > -        if (errno == EINTR) {
> > > > > > +        case EINTR:
> > > > > >              goto retry;
> > > > > > +        case ENOBUFS:
> > > > > > +            if (sflags & MSG_ZEROCOPY) {
> > > > > > +                error_setg_errno(errp, errno,
> > > > > > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > > > > > +                return -1;
> > > > > > +            }
> > > > >
> > > > > I have no idea whether it'll make a real differnece, but - should we better add
> > > > > a "break" here?  If you agree and with that fixed, feel free to add:
> > > > >
> > > > > Reviewed-by: Peter Xu <peterx@redhat.com>
> > > > >
> > > > > I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> > > > > here it's by default unlimited, but just curious when we should keep an eye.
> > > >
> > > > Fedora doesn't allow unlimited locked memory by default
> > > >
> > > > $ grep "locked memory" /proc/self/limits
> > > > Max locked memory         65536                65536                bytes
> > > >
> > > > And  regardless of Fedora defaults, libvirt will set a limit
> > > > for the guest. It will only be unlimited if requiring certain
> > > > things like VFIO.
> > >
> > > Thanks, I obviously checked up the wrong host..
> > >
> > > Leo, do you know how much locked memory will be needed by zero copy?  Will
> > > there be a limit?  Is it linear to the number of sockets/channels?
> >
> > IIRC we decided it would be limited by the socket send buffer size, rather
> > than guest RAM, because writes will block once the send buffer is full.
> >
> > This has a default global setting, with per-socket override. On one box I
> > have it is 200 Kb. With multifd you'll need  "num-sockets * send buffer".
> >
> > > It'll be better if we can fail at enabling the feature when we detected that
> > > the specified locked memory limit may not be suffice.
> >
> > Checking this value against available locked memory though will always
> > have an error margin because other things in QEMU can use locked memory
> > too
>
> We could always still allow false positive in this check, so we can fail if we
> have a solid clue to know we'll fail later (e.g. minimum locked_vm needed is
> already less than total).  But no strong opinion; we could have this merged and
> see whether that's needed in real life.  Thanks,

I agree, this is a good approach.

Leo
Leonardo Bras Jan. 19, 2022, 5:24 p.m. UTC | #11
On Thu, Jan 13, 2022 at 10:06 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > For CONFIG_LINUX, implement the new zero copy flag and the optional callback
> > io_flush on QIOChannelSocket, but enables it only when MSG_ZEROCOPY
> > feature is available in the host kernel, which is checked on
> > qio_channel_socket_connect_sync()
> >
> > qio_channel_socket_flush() was implemented by counting how many times
> > sendmsg(...,MSG_ZEROCOPY) was successfully called, and then reading the
> > socket's error queue, in order to find how many of them finished sending.
> > Flush will loop until those counters are the same, or until some error occurs.
> >
> > Notes on using writev() with QIO_CHANNEL_WRITE_FLAG_ZERO_COPY:
> > 1: Buffer
> > - As MSG_ZEROCOPY tells the kernel to use the same user buffer to avoid copying,
> > some caution is necessary to avoid overwriting any buffer before it's sent.
> > If something like this happen, a newer version of the buffer may be sent instead.
> > - If this is a problem, it's recommended to call qio_channel_flush() before freeing
> > or re-using the buffer.
> >
> > 2: Locked memory
> > - When using MSG_ZERCOCOPY, the buffer memory will be locked after queued, and
> > unlocked after it's sent.
> > - Depending on the size of each buffer, and how often it's sent, it may require
> > a larger amount of locked memory than usually available to non-root user.
> > - If the required amount of locked memory is not available, writev_zero_copy
> > will return an error, which can abort an operation like migration,
> > - Because of this, when an user code wants to add zero copy as a feature, it
> > requires a mechanism to disable it, so it can still be accessible to less
> > privileged users.
> >
> > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > ---
> >  include/io/channel-socket.h |   2 +
> >  io/channel-socket.c         | 107 ++++++++++++++++++++++++++++++++++--
> >  2 files changed, 105 insertions(+), 4 deletions(-)
>
> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
>

Thanks!

>
> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>
Peter Xu Jan. 20, 2022, 1:28 a.m. UTC | #12
On Wed, Jan 19, 2022 at 02:22:56PM -0300, Leonardo Bras Soares Passos wrote:
> Hello Daniel,
> 
> On Thu, Jan 13, 2022 at 7:42 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
> >
> > On Thu, Jan 13, 2022 at 06:34:12PM +0800, Peter Xu wrote:
> > > On Thu, Jan 13, 2022 at 10:06:14AM +0000, Daniel P. Berrangé wrote:
> > > > On Thu, Jan 13, 2022 at 02:48:15PM +0800, Peter Xu wrote:
> > > > > On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > > > > > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > > >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> > > > > >      }
> > > > > >
> > > > > > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > > > > > +        sflags = MSG_ZEROCOPY;
> > > > > > +    }
> > > > > > +
> > > > > >   retry:
> > > > > > -    ret = sendmsg(sioc->fd, &msg, 0);
> > > > > > +    ret = sendmsg(sioc->fd, &msg, sflags);
> > > > > >      if (ret <= 0) {
> > > > > > -        if (errno == EAGAIN) {
> > > > > > +        switch (errno) {
> > > > > > +        case EAGAIN:
> > > > > >              return QIO_CHANNEL_ERR_BLOCK;
> > > > > > -        }
> > > > > > -        if (errno == EINTR) {
> > > > > > +        case EINTR:
> > > > > >              goto retry;
> > > > > > +        case ENOBUFS:
> > > > > > +            if (sflags & MSG_ZEROCOPY) {
> > > > > > +                error_setg_errno(errp, errno,
> > > > > > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > > > > > +                return -1;
> > > > > > +            }
> > > > >
> > > > > I have no idea whether it'll make a real differnece, but - should we better add
> > > > > a "break" here?  If you agree and with that fixed, feel free to add:
> > > > >
> > > > > Reviewed-by: Peter Xu <peterx@redhat.com>
> > > > >
> > > > > I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> > > > > here it's by default unlimited, but just curious when we should keep an eye.
> > > >
> > > > Fedora doesn't allow unlimited locked memory by default
> > > >
> > > > $ grep "locked memory" /proc/self/limits
> > > > Max locked memory         65536                65536                bytes
> > > >
> > > > And  regardless of Fedora defaults, libvirt will set a limit
> > > > for the guest. It will only be unlimited if requiring certain
> > > > things like VFIO.
> > >
> > > Thanks, I obviously checked up the wrong host..
> > >
> > > Leo, do you know how much locked memory will be needed by zero copy?  Will
> > > there be a limit?  Is it linear to the number of sockets/channels?
> >
> > IIRC we decided it would be limited by the socket send buffer size, rather
> > than guest RAM, because writes will block once the send buffer is full.
> >
> > This has a default global setting, with per-socket override. On one box I
> > have it is 200 Kb. With multifd you'll need  "num-sockets * send buffer".
> 
> Oh, I was not aware there is a send buffer size (or maybe I am unable
> to recall).
> That sure makes things much easier.
> 
> >
> > > It'll be better if we can fail at enabling the feature when we detected that
> > > the specified locked memory limit may not be suffice.
> 
> sure
> 
> >
> > Checking this value against available locked memory though will always
> > have an error margin because other things in QEMU can use locked memory
> > too
> 
> We can get the current limit (before zerocopy) as an error margin:
> req_lock_mem = num-sockets * send buffer + BASE_LOCKED
> 
> Where BASE_LOCKED is the current libvirt value, or so on.

Hmm.. not familiar with libvirt, so I'm curious whether libvirt is actually
enlarging the allowed locked mem on Fedora since the default is 64KB?

I think it'll be great to capture the very major going-to-fail scenarios.  For
example, I'm wondering whether a qemu (without libvirt) will simply fail
directly on Fedora using non-root even with 1 channel due to the 64K limit, or
the other extreme case is when the user does not allow locking mem at all in
some container environment (when we see max locked mem is zero).

It's not only about failing early, it's also about failing with a meaningful
error so the user knows what to tune, while I'm not very sure that'll be easily
understandable when we wait until the failure of io_writev().

Thanks,
Leonardo Bras Feb. 1, 2022, 4:23 a.m. UTC | #13
Hello Peter,

Re-reading everything before submitting the next version.
I think I finally got that you are suggesting to just add a break at
the end of the case, after the if :)

Sorry I misunderstand that before,

Best regards,
Leo

On Thu, Jan 13, 2022 at 3:48 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Jan 06, 2022 at 07:13:39PM -0300, Leonardo Bras wrote:
> > @@ -558,15 +575,26 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> >          memcpy(CMSG_DATA(cmsg), fds, fdsize);
> >      }
> >
> > +    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
> > +        sflags = MSG_ZEROCOPY;
> > +    }
> > +
> >   retry:
> > -    ret = sendmsg(sioc->fd, &msg, 0);
> > +    ret = sendmsg(sioc->fd, &msg, sflags);
> >      if (ret <= 0) {
> > -        if (errno == EAGAIN) {
> > +        switch (errno) {
> > +        case EAGAIN:
> >              return QIO_CHANNEL_ERR_BLOCK;
> > -        }
> > -        if (errno == EINTR) {
> > +        case EINTR:
> >              goto retry;
> > +        case ENOBUFS:
> > +            if (sflags & MSG_ZEROCOPY) {
> > +                error_setg_errno(errp, errno,
> > +                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
> > +                return -1;
> > +            }
>
> I have no idea whether it'll make a real differnece, but - should we better add
> a "break" here?  If you agree and with that fixed, feel free to add:
>
> Reviewed-by: Peter Xu <peterx@redhat.com>
>
> I also wonder whether you hit ENOBUFS in any of the environments.  On Fedora
> here it's by default unlimited, but just curious when we should keep an eye.
>
> Thanks,
>
> --
> Peter Xu
>
diff mbox series

Patch

diff --git a/include/io/channel-socket.h b/include/io/channel-socket.h
index e747e63514..513c428fe4 100644
--- a/include/io/channel-socket.h
+++ b/include/io/channel-socket.h
@@ -47,6 +47,8 @@  struct QIOChannelSocket {
     socklen_t localAddrLen;
     struct sockaddr_storage remoteAddr;
     socklen_t remoteAddrLen;
+    ssize_t zero_copy_queued;
+    ssize_t zero_copy_sent;
 };
 
 
diff --git a/io/channel-socket.c b/io/channel-socket.c
index bfbd64787e..fb1e210ec5 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -26,6 +26,10 @@ 
 #include "io/channel-watch.h"
 #include "trace.h"
 #include "qapi/clone-visitor.h"
+#ifdef CONFIG_LINUX
+#include <linux/errqueue.h>
+#include <bits/socket.h>
+#endif
 
 #define SOCKET_MAX_FDS 16
 
@@ -55,6 +59,8 @@  qio_channel_socket_new(void)
 
     sioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET));
     sioc->fd = -1;
+    sioc->zero_copy_queued = 0;
+    sioc->zero_copy_sent = 0;
 
     ioc = QIO_CHANNEL(sioc);
     qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN);
@@ -154,6 +160,16 @@  int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
         return -1;
     }
 
+#ifdef CONFIG_LINUX
+    int ret, v = 1;
+    ret = qemu_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &v, sizeof(v));
+    if (ret == 0) {
+        /* Zero copy available on host */
+        qio_channel_set_feature(QIO_CHANNEL(ioc),
+                                QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY);
+    }
+#endif
+
     return 0;
 }
 
@@ -534,6 +550,7 @@  static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
     char control[CMSG_SPACE(sizeof(int) * SOCKET_MAX_FDS)];
     size_t fdsize = sizeof(int) * nfds;
     struct cmsghdr *cmsg;
+    int sflags = 0;
 
     memset(control, 0, CMSG_SPACE(sizeof(int) * SOCKET_MAX_FDS));
 
@@ -558,15 +575,26 @@  static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
         memcpy(CMSG_DATA(cmsg), fds, fdsize);
     }
 
+    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
+        sflags = MSG_ZEROCOPY;
+    }
+
  retry:
-    ret = sendmsg(sioc->fd, &msg, 0);
+    ret = sendmsg(sioc->fd, &msg, sflags);
     if (ret <= 0) {
-        if (errno == EAGAIN) {
+        switch (errno) {
+        case EAGAIN:
             return QIO_CHANNEL_ERR_BLOCK;
-        }
-        if (errno == EINTR) {
+        case EINTR:
             goto retry;
+        case ENOBUFS:
+            if (sflags & MSG_ZEROCOPY) {
+                error_setg_errno(errp, errno,
+                                 "Process can't lock enough memory for using MSG_ZEROCOPY");
+                return -1;
+            }
         }
+
         error_setg_errno(errp, errno,
                          "Unable to write to socket");
         return -1;
@@ -660,6 +688,74 @@  static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
 }
 #endif /* WIN32 */
 
+
+#ifdef CONFIG_LINUX
+static int qio_channel_socket_flush(QIOChannel *ioc,
+                                    Error **errp)
+{
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+    struct msghdr msg = {};
+    struct sock_extended_err *serr;
+    struct cmsghdr *cm;
+    char control[CMSG_SPACE(sizeof(*serr))];
+    int received;
+    int ret = 1;
+
+    msg.msg_control = control;
+    msg.msg_controllen = sizeof(control);
+    memset(control, 0, sizeof(control));
+
+    while (sioc->zero_copy_sent < sioc->zero_copy_queued) {
+        received = recvmsg(sioc->fd, &msg, MSG_ERRQUEUE);
+        if (received < 0) {
+            switch (errno) {
+            case EAGAIN:
+                /* Nothing on errqueue, wait until something is available */
+                qio_channel_wait(ioc, G_IO_ERR);
+                continue;
+            case EINTR:
+                continue;
+            default:
+                error_setg_errno(errp, errno,
+                                 "Unable to read errqueue");
+                return -1;
+            }
+        }
+
+        cm = CMSG_FIRSTHDR(&msg);
+        if (cm->cmsg_level != SOL_IP &&
+            cm->cmsg_type != IP_RECVERR) {
+            error_setg_errno(errp, EPROTOTYPE,
+                             "Wrong cmsg in errqueue");
+            return -1;
+        }
+
+        serr = (void *) CMSG_DATA(cm);
+        if (serr->ee_errno != SO_EE_ORIGIN_NONE) {
+            error_setg_errno(errp, serr->ee_errno,
+                             "Error on socket");
+            return -1;
+        }
+        if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
+            error_setg_errno(errp, serr->ee_origin,
+                             "Error not from zero copy");
+            return -1;
+        }
+
+        /* No errors, count successfully finished sendmsg()*/
+        sioc->zero_copy_sent += serr->ee_data - serr->ee_info + 1;
+
+        /* If any sendmsg() succeeded using zero copy, return 0 at the end */
+        if (serr->ee_code != SO_EE_CODE_ZEROCOPY_COPIED) {
+            ret = 0;
+        }
+    }
+
+    return ret;
+}
+
+#endif /* CONFIG_LINUX */
+
 static int
 qio_channel_socket_set_blocking(QIOChannel *ioc,
                                 bool enabled,
@@ -789,6 +885,9 @@  static void qio_channel_socket_class_init(ObjectClass *klass,
     ioc_klass->io_set_delay = qio_channel_socket_set_delay;
     ioc_klass->io_create_watch = qio_channel_socket_create_watch;
     ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
+#ifdef CONFIG_LINUX
+    ioc_klass->io_flush = qio_channel_socket_flush;
+#endif
 }
 
 static const TypeInfo qio_channel_socket_info = {