diff mbox series

[v7,1/5] QIOChannel: Add flags on io_writev and introduce io_flush callback

Message ID 20220106221341.8779-2-leobras@redhat.com (mailing list archive)
State New, archived
Headers show
Series MSG_ZEROCOPY + multifd | expand

Commit Message

Leonardo Bras Jan. 6, 2022, 10:13 p.m. UTC
Add flags to io_writev and introduce io_flush as optional callback to
QIOChannelClass, allowing the implementation of zero copy writes by
subclasses.

How to use them:
- Write data using qio_channel_writev(...,QIO_CHANNEL_WRITE_FLAG_ZERO_COPY),
- Wait write completion with qio_channel_flush().

Notes:
As some zero copy implementations work asynchronously, it's
recommended to keep the write buffer untouched until the return of
qio_channel_flush(), to avoid the risk of sending an updated buffer
instead of the buffer state during write.

As io_flush callback is optional, if a subclass does not implement it, then:
- io_flush will return 0 without changing anything.

Also, some functions like qio_channel_writev_full_all() were adapted to
receive a flag parameter. That allows shared code between zero copy and
non-zero copy writev, and also an easier implementation on new flags.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
---
 include/io/channel.h | 67 +++++++++++++++++++++++++++++++++++---------
 io/channel-buffer.c  |  1 +
 io/channel-command.c |  1 +
 io/channel-file.c    |  1 +
 io/channel-socket.c  |  2 ++
 io/channel-tls.c     |  1 +
 io/channel-websock.c |  1 +
 io/channel.c         | 51 +++++++++++++++++++++++----------
 migration/rdma.c     |  1 +
 9 files changed, 98 insertions(+), 28 deletions(-)

Comments

Peter Xu Jan. 13, 2022, 6:28 a.m. UTC | #1
On Thu, Jan 06, 2022 at 07:13:38PM -0300, Leonardo Bras wrote:
> diff --git a/io/channel.c b/io/channel.c
> index e8b019dc36..904855e16e 100644
> --- a/io/channel.c
> +++ b/io/channel.c
> @@ -67,12 +67,13 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
>  }
>  
>  
> -ssize_t qio_channel_writev_full(QIOChannel *ioc,
> -                                const struct iovec *iov,
> -                                size_t niov,
> -                                int *fds,
> -                                size_t nfds,
> -                                Error **errp)
> +ssize_t qio_channel_writev_full_flags(QIOChannel *ioc,
> +                                      const struct iovec *iov,
> +                                      size_t niov,
> +                                      int *fds,
> +                                      size_t nfds,
> +                                      int flags,
> +                                      Error **errp)
>  {
>      QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
>  
> @@ -83,7 +84,7 @@ ssize_t qio_channel_writev_full(QIOChannel *ioc,
>          return -1;
>      }

Should we better also check QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY here when
QIO_CHANNEL_WRITE_FLAG_ZERO_COPY is set?  Just like what we do with:

    if ((fds || nfds) &&
        !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
        error_setg_errno(errp, EINVAL,
                         "Channel does not support file descriptor passing");
        return -1;
    }

I still think it's better to have the caller be crystal clear when to use
zero_copy feature because it has implication on buffer lifetime.

I might have commented similar things before, but I have missed a few versions
so I could also have missed some previous discussions..

>  
> -    return klass->io_writev(ioc, iov, niov, fds, nfds, errp);
> +    return klass->io_writev(ioc, iov, niov, fds, nfds, flags, errp);
>  }
Daniel P. Berrangé Jan. 13, 2022, 10:52 a.m. UTC | #2
On Thu, Jan 06, 2022 at 07:13:38PM -0300, Leonardo Bras wrote:
> Add flags to io_writev and introduce io_flush as optional callback to
> QIOChannelClass, allowing the implementation of zero copy writes by
> subclasses.
> 
> How to use them:
> - Write data using qio_channel_writev(...,QIO_CHANNEL_WRITE_FLAG_ZERO_COPY),
> - Wait write completion with qio_channel_flush().
> 
> Notes:
> As some zero copy implementations work asynchronously, it's
> recommended to keep the write buffer untouched until the return of
> qio_channel_flush(), to avoid the risk of sending an updated buffer
> instead of the buffer state during write.
> 
> As io_flush callback is optional, if a subclass does not implement it, then:
> - io_flush will return 0 without changing anything.
> 
> Also, some functions like qio_channel_writev_full_all() were adapted to
> receive a flag parameter. That allows shared code between zero copy and
> non-zero copy writev, and also an easier implementation on new flags.
> 
> Signed-off-by: Leonardo Bras <leobras@redhat.com>
> ---
>  include/io/channel.h | 67 +++++++++++++++++++++++++++++++++++---------
>  io/channel-buffer.c  |  1 +
>  io/channel-command.c |  1 +
>  io/channel-file.c    |  1 +
>  io/channel-socket.c  |  2 ++
>  io/channel-tls.c     |  1 +
>  io/channel-websock.c |  1 +
>  io/channel.c         | 51 +++++++++++++++++++++++----------
>  migration/rdma.c     |  1 +
>  9 files changed, 98 insertions(+), 28 deletions(-)
> 
> diff --git a/include/io/channel.h b/include/io/channel.h
> index 88988979f8..343766ce5b 100644
> --- a/include/io/channel.h
> +++ b/include/io/channel.h
> @@ -32,12 +32,15 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
>  
>  #define QIO_CHANNEL_ERR_BLOCK -2
>  
> +#define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
> +
>  typedef enum QIOChannelFeature QIOChannelFeature;
>  
>  enum QIOChannelFeature {
>      QIO_CHANNEL_FEATURE_FD_PASS,
>      QIO_CHANNEL_FEATURE_SHUTDOWN,
>      QIO_CHANNEL_FEATURE_LISTEN,
> +    QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
>  };
>  
>  
> @@ -104,6 +107,7 @@ struct QIOChannelClass {
>                           size_t niov,
>                           int *fds,
>                           size_t nfds,
> +                         int flags,
>                           Error **errp);
>      ssize_t (*io_readv)(QIOChannel *ioc,
>                          const struct iovec *iov,
> @@ -136,6 +140,8 @@ struct QIOChannelClass {
>                                    IOHandler *io_read,
>                                    IOHandler *io_write,
>                                    void *opaque);
> +    int (*io_flush)(QIOChannel *ioc,
> +                    Error **errp);
>  };
>  
>  /* General I/O handling functions */
> @@ -222,12 +228,13 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
>  
>  
>  /**
> - * qio_channel_writev_full:
> + * qio_channel_writev_full_flags:
>   * @ioc: the channel object
>   * @iov: the array of memory regions to write data from
>   * @niov: the length of the @iov array
>   * @fds: an array of file handles to send
>   * @nfds: number of file handles in @fds
> + * @flags: write flags (QIO_CHANNEL_WRITE_FLAG_*)
>   * @errp: pointer to a NULL-initialized error object
>   *
>   * Write data to the IO channel, reading it from the
> @@ -255,12 +262,16 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
>   * or QIO_CHANNEL_ERR_BLOCK if no data is can be sent
>   * and the channel is non-blocking
>   */
> -ssize_t qio_channel_writev_full(QIOChannel *ioc,
> -                                const struct iovec *iov,
> -                                size_t niov,
> -                                int *fds,
> -                                size_t nfds,
> -                                Error **errp);
> +ssize_t qio_channel_writev_full_flags(QIOChannel *ioc,
> +                                      const struct iovec *iov,
> +                                      size_t niov,
> +                                      int *fds,
> +                                      size_t nfds,
> +                                      int flags,
> +                                      Error **errp);
> +
> +#define qio_channel_writev_full(ioc, iov, niov, fds, nfds, errp) \
> +    qio_channel_writev_full_flags(ioc, iov, niov, fds, nfds, 0, errp)

Don't introduce yet another API variant here. Just add flags to
all the existing write APIs with "full" in their name. The word
"full" in their name was intended to indicate that they are
accepting all possible parameters, so it doesn't mean sense to
add APIs which take even more possible parameters.

> +int qio_channel_writev_full_all_flags(QIOChannel *ioc,
> +                                      const struct iovec *iov,
> +                                      size_t niov,
> +                                      int *fds, size_t nfds,
> +                                      int flags, Error **errp);
> +#define qio_channel_writev_full_all(ioc, iov, niov, fds, nfds, errp) \
> +    qio_channel_writev_full_all_flags(ioc, iov, niov, fds, nfds, 0, errp)

Same note.

> +/**
> + * qio_channel_flush:
> + * @ioc: the channel object
> + * @errp: pointer to a NULL-initialized error object
> + *
> + * Will block until every packet queued with
> + * qio_channel_writev_full_flags() + QIO_CHANNEL_WRITE_FLAG_ZERO_COPY
> + * is sent, or return in case of any error.
> + *
> + * If not implemented, acts as a no-op, and returns 0.
> + *
> + * Returns -1 if any error is found,
> + *          1 if every send failed to use zero copy.
> + *          0 otherwise.
> + */
> +
> +int qio_channel_flush(QIOChannel *ioc,
> +                      Error **errp);

Regards,
Daniel
Leonardo Bras Jan. 18, 2022, 8:45 p.m. UTC | #3
Hello Peter,

On Thu, Jan 13, 2022 at 3:28 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Jan 06, 2022 at 07:13:38PM -0300, Leonardo Bras wrote:
> > diff --git a/io/channel.c b/io/channel.c
> > index e8b019dc36..904855e16e 100644
> > --- a/io/channel.c
> > +++ b/io/channel.c
> > @@ -67,12 +67,13 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
> >  }
> >
> >
> > -ssize_t qio_channel_writev_full(QIOChannel *ioc,
> > -                                const struct iovec *iov,
> > -                                size_t niov,
> > -                                int *fds,
> > -                                size_t nfds,
> > -                                Error **errp)
> > +ssize_t qio_channel_writev_full_flags(QIOChannel *ioc,
> > +                                      const struct iovec *iov,
> > +                                      size_t niov,
> > +                                      int *fds,
> > +                                      size_t nfds,
> > +                                      int flags,
> > +                                      Error **errp)
> >  {
> >      QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
> >
> > @@ -83,7 +84,7 @@ ssize_t qio_channel_writev_full(QIOChannel *ioc,
> >          return -1;
> >      }
>
> Should we better also check QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY here when
> QIO_CHANNEL_WRITE_FLAG_ZERO_COPY is set?  Just like what we do with:

Yes, that's correct.
I will also test for fds + zerocopy_flag , which should also fail here.

>
>     if ((fds || nfds) &&
>         !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
>         error_setg_errno(errp, EINVAL,
>                          "Channel does not support file descriptor passing");
>         return -1;
>     }
>
> I still think it's better to have the caller be crystal clear when to use
> zero_copy feature because it has implication on buffer lifetime.

I don't disagree with that suggestion.

But the buffer lifetime limitation is something on the socket
implementation, right?
There could be some synchronous zerocopy implementation that does not
require flush, and thus
don't require the buffer to be treated any special. Or am I missing something?

>
> I might have commented similar things before, but I have missed a few versions
> so I could also have missed some previous discussions..
>

That's all great suggestions Peter!  Thanks for that!

Some of the previous suggestions may have been missed because a lot of
code moved.
Sorry about that.

Best regards,
Leo
Peter Xu Jan. 19, 2022, 1:58 a.m. UTC | #4
On Tue, Jan 18, 2022 at 05:45:09PM -0300, Leonardo Bras Soares Passos wrote:
> Hello Peter,
> 
> On Thu, Jan 13, 2022 at 3:28 AM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Thu, Jan 06, 2022 at 07:13:38PM -0300, Leonardo Bras wrote:
> > > diff --git a/io/channel.c b/io/channel.c
> > > index e8b019dc36..904855e16e 100644
> > > --- a/io/channel.c
> > > +++ b/io/channel.c
> > > @@ -67,12 +67,13 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
> > >  }
> > >
> > >
> > > -ssize_t qio_channel_writev_full(QIOChannel *ioc,
> > > -                                const struct iovec *iov,
> > > -                                size_t niov,
> > > -                                int *fds,
> > > -                                size_t nfds,
> > > -                                Error **errp)
> > > +ssize_t qio_channel_writev_full_flags(QIOChannel *ioc,
> > > +                                      const struct iovec *iov,
> > > +                                      size_t niov,
> > > +                                      int *fds,
> > > +                                      size_t nfds,
> > > +                                      int flags,
> > > +                                      Error **errp)
> > >  {
> > >      QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
> > >
> > > @@ -83,7 +84,7 @@ ssize_t qio_channel_writev_full(QIOChannel *ioc,
> > >          return -1;
> > >      }
> >
> > Should we better also check QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY here when
> > QIO_CHANNEL_WRITE_FLAG_ZERO_COPY is set?  Just like what we do with:
> 
> Yes, that's correct.
> I will also test for fds + zerocopy_flag , which should also fail here.
> 
> >
> >     if ((fds || nfds) &&
> >         !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
> >         error_setg_errno(errp, EINVAL,
> >                          "Channel does not support file descriptor passing");
> >         return -1;
> >     }
> >
> > I still think it's better to have the caller be crystal clear when to use
> > zero_copy feature because it has implication on buffer lifetime.
> 
> I don't disagree with that suggestion.
> 
> But the buffer lifetime limitation is something on the socket
> implementation, right?
> There could be some synchronous zerocopy implementation that does not
> require flush, and thus
> don't require the buffer to be treated any special. Or am I missing something?

Currently the flush() is required for zerocopy and not required for all the
existing non-zerocopy use cases, that's already an API difference so the caller
needs to identify it anyway.  Then I think it's simpler we expose all of it to
the user.

Not to mention IIUC if we don't fail here, it will just fail later when the
code will unconditionally convert the flags=ZEROCOPY into MSG_ZEROCOPY in your
next patch:

    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
        sflags = MSG_ZEROCOPY;
    }

So AFAIU it'll fail anyway, either here with the cap check I mentioned, or
later in sendmsg().

IOW, I think it fails cleaner here, rather than reaching sendmsg().

> 
> >
> > I might have commented similar things before, but I have missed a few versions
> > so I could also have missed some previous discussions..
> >
> 
> That's all great suggestions Peter!  Thanks for that!
> 
> Some of the previous suggestions may have been missed because a lot of
> code moved.
> Sorry about that.

Not a problem at all, I just want to make sure my question still makes
sense. :)
Leonardo Bras Jan. 19, 2022, 4:38 a.m. UTC | #5
Hello Daniel,

On Thu, Jan 13, 2022 at 7:53 AM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Thu, Jan 06, 2022 at 07:13:38PM -0300, Leonardo Bras wrote:
> > Add flags to io_writev and introduce io_flush as optional callback to
> > QIOChannelClass, allowing the implementation of zero copy writes by
> > subclasses.
> >
> > How to use them:
> > - Write data using qio_channel_writev(...,QIO_CHANNEL_WRITE_FLAG_ZERO_COPY),
> > - Wait write completion with qio_channel_flush().
> >
> > Notes:
> > As some zero copy implementations work asynchronously, it's
> > recommended to keep the write buffer untouched until the return of
> > qio_channel_flush(), to avoid the risk of sending an updated buffer
> > instead of the buffer state during write.
> >
> > As io_flush callback is optional, if a subclass does not implement it, then:
> > - io_flush will return 0 without changing anything.
> >
> > Also, some functions like qio_channel_writev_full_all() were adapted to
> > receive a flag parameter. That allows shared code between zero copy and
> > non-zero copy writev, and also an easier implementation on new flags.
> >
> > Signed-off-by: Leonardo Bras <leobras@redhat.com>
> > ---
> >  include/io/channel.h | 67 +++++++++++++++++++++++++++++++++++---------
> >  io/channel-buffer.c  |  1 +
> >  io/channel-command.c |  1 +
> >  io/channel-file.c    |  1 +
> >  io/channel-socket.c  |  2 ++
> >  io/channel-tls.c     |  1 +
> >  io/channel-websock.c |  1 +
> >  io/channel.c         | 51 +++++++++++++++++++++++----------
> >  migration/rdma.c     |  1 +
> >  9 files changed, 98 insertions(+), 28 deletions(-)
> >
> > diff --git a/include/io/channel.h b/include/io/channel.h
> > index 88988979f8..343766ce5b 100644
> > --- a/include/io/channel.h
> > +++ b/include/io/channel.h
> > @@ -32,12 +32,15 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
> >
> >  #define QIO_CHANNEL_ERR_BLOCK -2
> >
> > +#define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
> > +
> >  typedef enum QIOChannelFeature QIOChannelFeature;
> >
> >  enum QIOChannelFeature {
> >      QIO_CHANNEL_FEATURE_FD_PASS,
> >      QIO_CHANNEL_FEATURE_SHUTDOWN,
> >      QIO_CHANNEL_FEATURE_LISTEN,
> > +    QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
> >  };
> >
> >
> > @@ -104,6 +107,7 @@ struct QIOChannelClass {
> >                           size_t niov,
> >                           int *fds,
> >                           size_t nfds,
> > +                         int flags,
> >                           Error **errp);
> >      ssize_t (*io_readv)(QIOChannel *ioc,
> >                          const struct iovec *iov,
> > @@ -136,6 +140,8 @@ struct QIOChannelClass {
> >                                    IOHandler *io_read,
> >                                    IOHandler *io_write,
> >                                    void *opaque);
> > +    int (*io_flush)(QIOChannel *ioc,
> > +                    Error **errp);
> >  };
> >
> >  /* General I/O handling functions */
> > @@ -222,12 +228,13 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
> >
> >
> >  /**
> > - * qio_channel_writev_full:
> > + * qio_channel_writev_full_flags:
> >   * @ioc: the channel object
> >   * @iov: the array of memory regions to write data from
> >   * @niov: the length of the @iov array
> >   * @fds: an array of file handles to send
> >   * @nfds: number of file handles in @fds
> > + * @flags: write flags (QIO_CHANNEL_WRITE_FLAG_*)
> >   * @errp: pointer to a NULL-initialized error object
> >   *
> >   * Write data to the IO channel, reading it from the
> > @@ -255,12 +262,16 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
> >   * or QIO_CHANNEL_ERR_BLOCK if no data is can be sent
> >   * and the channel is non-blocking
> >   */
> > -ssize_t qio_channel_writev_full(QIOChannel *ioc,
> > -                                const struct iovec *iov,
> > -                                size_t niov,
> > -                                int *fds,
> > -                                size_t nfds,
> > -                                Error **errp);
> > +ssize_t qio_channel_writev_full_flags(QIOChannel *ioc,
> > +                                      const struct iovec *iov,
> > +                                      size_t niov,
> > +                                      int *fds,
> > +                                      size_t nfds,
> > +                                      int flags,
> > +                                      Error **errp);
> > +
> > +#define qio_channel_writev_full(ioc, iov, niov, fds, nfds, errp) \
> > +    qio_channel_writev_full_flags(ioc, iov, niov, fds, nfds, 0, errp)
>
> Don't introduce yet another API variant here. Just add flags to
> all the existing write APIs with "full" in their name. The word
> "full" in their name was intended to indicate that they are
> accepting all possible parameters, so it doesn't mean sense to
> add APIs which take even more possible parameters.

Oh, I was not aware of that. Thanks for letting me know!

Sure, I will do this change for v8.


>
> > +int qio_channel_writev_full_all_flags(QIOChannel *ioc,
> > +                                      const struct iovec *iov,
> > +                                      size_t niov,
> > +                                      int *fds, size_t nfds,
> > +                                      int flags, Error **errp);
> > +#define qio_channel_writev_full_all(ioc, iov, niov, fds, nfds, errp) \
> > +    qio_channel_writev_full_all_flags(ioc, iov, niov, fds, nfds, 0, errp)
>
> Same note.
>
> > +/**
> > + * qio_channel_flush:
> > + * @ioc: the channel object
> > + * @errp: pointer to a NULL-initialized error object
> > + *
> > + * Will block until every packet queued with
> > + * qio_channel_writev_full_flags() + QIO_CHANNEL_WRITE_FLAG_ZERO_COPY
> > + * is sent, or return in case of any error.
> > + *
> > + * If not implemented, acts as a no-op, and returns 0.
> > + *
> > + * Returns -1 if any error is found,
> > + *          1 if every send failed to use zero copy.
> > + *          0 otherwise.
> > + */
> > +
> > +int qio_channel_flush(QIOChannel *ioc,
> > +                      Error **errp);
>
>

Best regards,
Leo
Leonardo Bras Jan. 19, 2022, 6:29 p.m. UTC | #6
On Tue, Jan 18, 2022 at 10:58 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Tue, Jan 18, 2022 at 05:45:09PM -0300, Leonardo Bras Soares Passos wrote:
> > Hello Peter,
> >
> > On Thu, Jan 13, 2022 at 3:28 AM Peter Xu <peterx@redhat.com> wrote:
> > >
> > > On Thu, Jan 06, 2022 at 07:13:38PM -0300, Leonardo Bras wrote:
> > > > diff --git a/io/channel.c b/io/channel.c
> > > > index e8b019dc36..904855e16e 100644
> > > > --- a/io/channel.c
> > > > +++ b/io/channel.c
> > > > @@ -67,12 +67,13 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
> > > >  }
> > > >
> > > >
> > > > -ssize_t qio_channel_writev_full(QIOChannel *ioc,
> > > > -                                const struct iovec *iov,
> > > > -                                size_t niov,
> > > > -                                int *fds,
> > > > -                                size_t nfds,
> > > > -                                Error **errp)
> > > > +ssize_t qio_channel_writev_full_flags(QIOChannel *ioc,
> > > > +                                      const struct iovec *iov,
> > > > +                                      size_t niov,
> > > > +                                      int *fds,
> > > > +                                      size_t nfds,
> > > > +                                      int flags,
> > > > +                                      Error **errp)
> > > >  {
> > > >      QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
> > > >
> > > > @@ -83,7 +84,7 @@ ssize_t qio_channel_writev_full(QIOChannel *ioc,
> > > >          return -1;
> > > >      }
> > >
> > > Should we better also check QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY here when
> > > QIO_CHANNEL_WRITE_FLAG_ZERO_COPY is set?  Just like what we do with:
> >
> > Yes, that's correct.
> > I will also test for fds + zerocopy_flag , which should also fail here.
> >
> > >
> > >     if ((fds || nfds) &&
> > >         !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
> > >         error_setg_errno(errp, EINVAL,
> > >                          "Channel does not support file descriptor passing");
> > >         return -1;
> > >     }
> > >
> > > I still think it's better to have the caller be crystal clear when to use
> > > zero_copy feature because it has implication on buffer lifetime.
> >
> > I don't disagree with that suggestion.
> >
> > But the buffer lifetime limitation is something on the socket
> > implementation, right?
> > There could be some synchronous zerocopy implementation that does not
> > require flush, and thus
> > don't require the buffer to be treated any special. Or am I missing something?
>
> Currently the flush() is required for zerocopy and not required for all the
> existing non-zerocopy use cases, that's already an API difference so the caller
> needs to identify it anyway.  Then I think it's simpler we expose all of it to
> the user.

Yeah, I agree.
Since one ZC implementation uses flush, all should use them. Even if
it's a no-op.
It was just an observation that not all ZC implementations have buffer
limitations, but I agree the user should expect them anyway, since
they will exist in some implementations.

>
> Not to mention IIUC if we don't fail here, it will just fail later when the
> code will unconditionally convert the flags=ZEROCOPY into MSG_ZEROCOPY in your
> next patch:
>
>     if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
>         sflags = MSG_ZEROCOPY;
>     }
>

Correct.

> So AFAIU it'll fail anyway, either here with the cap check I mentioned, or
> later in sendmsg().
>
> IOW, I think it fails cleaner here, rather than reaching sendmsg().

I Agree.

>
> >
> > >
> > > I might have commented similar things before, but I have missed a few versions
> > > so I could also have missed some previous discussions..
> > >
> >
> > That's all great suggestions Peter!  Thanks for that!
> >
> > Some of the previous suggestions may have been missed because a lot of
> > code moved.
> > Sorry about that.
>
> Not a problem at all, I just want to make sure my question still makes
> sense. :)

Thanks for asking them!

>
> --
> Peter Xu
>

Best regards,
Leo
diff mbox series

Patch

diff --git a/include/io/channel.h b/include/io/channel.h
index 88988979f8..343766ce5b 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -32,12 +32,15 @@  OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
 
 #define QIO_CHANNEL_ERR_BLOCK -2
 
+#define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
+
 typedef enum QIOChannelFeature QIOChannelFeature;
 
 enum QIOChannelFeature {
     QIO_CHANNEL_FEATURE_FD_PASS,
     QIO_CHANNEL_FEATURE_SHUTDOWN,
     QIO_CHANNEL_FEATURE_LISTEN,
+    QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
 };
 
 
@@ -104,6 +107,7 @@  struct QIOChannelClass {
                          size_t niov,
                          int *fds,
                          size_t nfds,
+                         int flags,
                          Error **errp);
     ssize_t (*io_readv)(QIOChannel *ioc,
                         const struct iovec *iov,
@@ -136,6 +140,8 @@  struct QIOChannelClass {
                                   IOHandler *io_read,
                                   IOHandler *io_write,
                                   void *opaque);
+    int (*io_flush)(QIOChannel *ioc,
+                    Error **errp);
 };
 
 /* General I/O handling functions */
@@ -222,12 +228,13 @@  ssize_t qio_channel_readv_full(QIOChannel *ioc,
 
 
 /**
- * qio_channel_writev_full:
+ * qio_channel_writev_full_flags:
  * @ioc: the channel object
  * @iov: the array of memory regions to write data from
  * @niov: the length of the @iov array
  * @fds: an array of file handles to send
  * @nfds: number of file handles in @fds
+ * @flags: write flags (QIO_CHANNEL_WRITE_FLAG_*)
  * @errp: pointer to a NULL-initialized error object
  *
  * Write data to the IO channel, reading it from the
@@ -255,12 +262,16 @@  ssize_t qio_channel_readv_full(QIOChannel *ioc,
  * or QIO_CHANNEL_ERR_BLOCK if no data is can be sent
  * and the channel is non-blocking
  */
-ssize_t qio_channel_writev_full(QIOChannel *ioc,
-                                const struct iovec *iov,
-                                size_t niov,
-                                int *fds,
-                                size_t nfds,
-                                Error **errp);
+ssize_t qio_channel_writev_full_flags(QIOChannel *ioc,
+                                      const struct iovec *iov,
+                                      size_t niov,
+                                      int *fds,
+                                      size_t nfds,
+                                      int flags,
+                                      Error **errp);
+
+#define qio_channel_writev_full(ioc, iov, niov, fds, nfds, errp) \
+    qio_channel_writev_full_flags(ioc, iov, niov, fds, nfds, 0, errp)
 
 /**
  * qio_channel_readv_all_eof:
@@ -831,12 +842,13 @@  int qio_channel_readv_full_all(QIOChannel *ioc,
                                Error **errp);
 
 /**
- * qio_channel_writev_full_all:
+ * qio_channel_writev_full_all_flags:
  * @ioc: the channel object
  * @iov: the array of memory regions to write data from
  * @niov: the length of the @iov array
  * @fds: an array of file handles to send
  * @nfds: number of file handles in @fds
+ * @flags: write flags (QIO_CHANNEL_WRITE_FLAG_*)
  * @errp: pointer to a NULL-initialized error object
  *
  *
@@ -846,13 +858,42 @@  int qio_channel_readv_full_all(QIOChannel *ioc,
  * to be written, yielding from the current coroutine
  * if required.
  *
+ * If QIO_CHANNEL_WRITE_FLAG_ZERO_COPY is passed in flags,
+ * instead of waiting for all requested data to be written,
+ * this function will wait until it's all queued for writing.
+ * In this case, if the buffer gets changed between queueing and
+ * sending, the updated buffer will be sent. If this is not a
+ * desired behavior, it's suggested to call qio_channel_flush()
+ * before reusing the buffer.
+ *
  * Returns: 0 if all bytes were written, or -1 on error
  */
 
-int qio_channel_writev_full_all(QIOChannel *ioc,
-                                const struct iovec *iov,
-                                size_t niov,
-                                int *fds, size_t nfds,
-                                Error **errp);
+int qio_channel_writev_full_all_flags(QIOChannel *ioc,
+                                      const struct iovec *iov,
+                                      size_t niov,
+                                      int *fds, size_t nfds,
+                                      int flags, Error **errp);
+#define qio_channel_writev_full_all(ioc, iov, niov, fds, nfds, errp) \
+    qio_channel_writev_full_all_flags(ioc, iov, niov, fds, nfds, 0, errp)
+
+/**
+ * qio_channel_flush:
+ * @ioc: the channel object
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Will block until every packet queued with
+ * qio_channel_writev_full_flags() + QIO_CHANNEL_WRITE_FLAG_ZERO_COPY
+ * is sent, or return in case of any error.
+ *
+ * If not implemented, acts as a no-op, and returns 0.
+ *
+ * Returns -1 if any error is found,
+ *          1 if every send failed to use zero copy.
+ *          0 otherwise.
+ */
+
+int qio_channel_flush(QIOChannel *ioc,
+                      Error **errp);
 
 #endif /* QIO_CHANNEL_H */
diff --git a/io/channel-buffer.c b/io/channel-buffer.c
index baa4e2b089..bf52011be2 100644
--- a/io/channel-buffer.c
+++ b/io/channel-buffer.c
@@ -81,6 +81,7 @@  static ssize_t qio_channel_buffer_writev(QIOChannel *ioc,
                                          size_t niov,
                                          int *fds,
                                          size_t nfds,
+                                         int flags,
                                          Error **errp)
 {
     QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);
diff --git a/io/channel-command.c b/io/channel-command.c
index b2a9e27138..5ff1691bad 100644
--- a/io/channel-command.c
+++ b/io/channel-command.c
@@ -258,6 +258,7 @@  static ssize_t qio_channel_command_writev(QIOChannel *ioc,
                                           size_t niov,
                                           int *fds,
                                           size_t nfds,
+                                          int flags,
                                           Error **errp)
 {
     QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
diff --git a/io/channel-file.c b/io/channel-file.c
index c4bf799a80..348a48545e 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -114,6 +114,7 @@  static ssize_t qio_channel_file_writev(QIOChannel *ioc,
                                        size_t niov,
                                        int *fds,
                                        size_t nfds,
+                                       int flags,
                                        Error **errp)
 {
     QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
diff --git a/io/channel-socket.c b/io/channel-socket.c
index 606ec97cf7..bfbd64787e 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -525,6 +525,7 @@  static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
                                          size_t niov,
                                          int *fds,
                                          size_t nfds,
+                                         int flags,
                                          Error **errp)
 {
     QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
@@ -620,6 +621,7 @@  static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
                                          size_t niov,
                                          int *fds,
                                          size_t nfds,
+                                         int flags,
                                          Error **errp)
 {
     QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
diff --git a/io/channel-tls.c b/io/channel-tls.c
index 2ae1b92fc0..4ce890a538 100644
--- a/io/channel-tls.c
+++ b/io/channel-tls.c
@@ -301,6 +301,7 @@  static ssize_t qio_channel_tls_writev(QIOChannel *ioc,
                                       size_t niov,
                                       int *fds,
                                       size_t nfds,
+                                      int flags,
                                       Error **errp)
 {
     QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
diff --git a/io/channel-websock.c b/io/channel-websock.c
index 70889bb54d..035dd6075b 100644
--- a/io/channel-websock.c
+++ b/io/channel-websock.c
@@ -1127,6 +1127,7 @@  static ssize_t qio_channel_websock_writev(QIOChannel *ioc,
                                           size_t niov,
                                           int *fds,
                                           size_t nfds,
+                                          int flags,
                                           Error **errp)
 {
     QIOChannelWebsock *wioc = QIO_CHANNEL_WEBSOCK(ioc);
diff --git a/io/channel.c b/io/channel.c
index e8b019dc36..904855e16e 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -67,12 +67,13 @@  ssize_t qio_channel_readv_full(QIOChannel *ioc,
 }
 
 
-ssize_t qio_channel_writev_full(QIOChannel *ioc,
-                                const struct iovec *iov,
-                                size_t niov,
-                                int *fds,
-                                size_t nfds,
-                                Error **errp)
+ssize_t qio_channel_writev_full_flags(QIOChannel *ioc,
+                                      const struct iovec *iov,
+                                      size_t niov,
+                                      int *fds,
+                                      size_t nfds,
+                                      int flags,
+                                      Error **errp)
 {
     QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
 
@@ -83,7 +84,7 @@  ssize_t qio_channel_writev_full(QIOChannel *ioc,
         return -1;
     }
 
-    return klass->io_writev(ioc, iov, niov, fds, nfds, errp);
+    return klass->io_writev(ioc, iov, niov, fds, nfds, flags, errp);
 }
 
 
@@ -217,14 +218,15 @@  int qio_channel_writev_all(QIOChannel *ioc,
                            size_t niov,
                            Error **errp)
 {
-    return qio_channel_writev_full_all(ioc, iov, niov, NULL, 0, errp);
+    return qio_channel_writev_full_all_flags(ioc, iov, niov, NULL, 0, 0,
+                                             errp);
 }
 
-int qio_channel_writev_full_all(QIOChannel *ioc,
-                                const struct iovec *iov,
-                                size_t niov,
-                                int *fds, size_t nfds,
-                                Error **errp)
+int qio_channel_writev_full_all_flags(QIOChannel *ioc,
+                                      const struct iovec *iov,
+                                      size_t niov,
+                                      int *fds, size_t nfds,
+                                      int flags, Error **errp)
 {
     int ret = -1;
     struct iovec *local_iov = g_new(struct iovec, niov);
@@ -235,10 +237,16 @@  int qio_channel_writev_full_all(QIOChannel *ioc,
                           iov, niov,
                           0, iov_size(iov, niov));
 
+    if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
+        assert(fds == NULL && nfds == 0);
+    }
+
     while (nlocal_iov > 0) {
         ssize_t len;
-        len = qio_channel_writev_full(ioc, local_iov, nlocal_iov, fds, nfds,
-                                      errp);
+
+        len = qio_channel_writev_full_flags(ioc, local_iov, nlocal_iov, fds,
+                                            nfds, flags, errp);
+
         if (len == QIO_CHANNEL_ERR_BLOCK) {
             if (qemu_in_coroutine()) {
                 qio_channel_yield(ioc, G_IO_OUT);
@@ -473,6 +481,19 @@  off_t qio_channel_io_seek(QIOChannel *ioc,
     return klass->io_seek(ioc, offset, whence, errp);
 }
 
+int qio_channel_flush(QIOChannel *ioc,
+                                Error **errp)
+{
+    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+
+    if (!klass->io_flush ||
+        !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY)) {
+        return 0;
+    }
+
+    return klass->io_flush(ioc, errp);
+}
+
 
 static void qio_channel_restart_read(void *opaque)
 {
diff --git a/migration/rdma.c b/migration/rdma.c
index f5d3bbe7e9..54acd2000e 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2833,6 +2833,7 @@  static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
                                        size_t niov,
                                        int *fds,
                                        size_t nfds,
+                                       int flags,
                                        Error **errp)
 {
     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);