diff mbox series

[v3,1/2] io: Add support for MSG_PEEK for socket channel

Message ID 20221119093615.158072-4-manish.mishra@nutanix.com (mailing list archive)
State New, archived
Headers show
Series [v3,1/2] io: Add support for MSG_PEEK for socket channel | expand

Commit Message

manish.mishra Nov. 19, 2022, 9:36 a.m. UTC
MSG_PEEK reads from the peek of channel, The data is treated as
unread and the next read shall still return this data. This
support is currently added only for socket class. Extra parameter
'flags' is added to io_readv calls to pass extra read flags like
MSG_PEEK.

Suggested-by: Daniel P. Berrangé <berrange@redhat.com
Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
---
 chardev/char-socket.c               |  4 +-
 include/io/channel.h                | 83 +++++++++++++++++++++++++++++
 io/channel-buffer.c                 |  1 +
 io/channel-command.c                |  1 +
 io/channel-file.c                   |  1 +
 io/channel-null.c                   |  1 +
 io/channel-socket.c                 | 16 +++++-
 io/channel-tls.c                    |  1 +
 io/channel-websock.c                |  1 +
 io/channel.c                        | 73 +++++++++++++++++++++++--
 migration/channel-block.c           |  1 +
 scsi/qemu-pr-helper.c               |  2 +-
 tests/qtest/tpm-emu.c               |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c            |  2 +-
 15 files changed, 179 insertions(+), 11 deletions(-)

Comments

Daniel P. Berrangé Nov. 22, 2022, 9 a.m. UTC | #1
On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> MSG_PEEK reads from the peek of channel, The data is treated as
> unread and the next read shall still return this data. This
> support is currently added only for socket class. Extra parameter
> 'flags' is added to io_readv calls to pass extra read flags like
> MSG_PEEK.
> 
> Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
> ---
>  chardev/char-socket.c               |  4 +-
>  include/io/channel.h                | 83 +++++++++++++++++++++++++++++
>  io/channel-buffer.c                 |  1 +
>  io/channel-command.c                |  1 +
>  io/channel-file.c                   |  1 +
>  io/channel-null.c                   |  1 +
>  io/channel-socket.c                 | 16 +++++-
>  io/channel-tls.c                    |  1 +
>  io/channel-websock.c                |  1 +
>  io/channel.c                        | 73 +++++++++++++++++++++++--
>  migration/channel-block.c           |  1 +
>  scsi/qemu-pr-helper.c               |  2 +-
>  tests/qtest/tpm-emu.c               |  2 +-
>  tests/unit/test-io-channel-socket.c |  1 +
>  util/vhost-user-server.c            |  2 +-
>  15 files changed, 179 insertions(+), 11 deletions(-)



> diff --git a/io/channel-socket.c b/io/channel-socket.c
> index b76dca9cc1..a06b24766d 100644
> --- a/io/channel-socket.c
> +++ b/io/channel-socket.c
> @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
>      }
>  #endif /* WIN32 */
>  
> +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> +

This covers the incoming server side socket.

This also needs to be set in outgoing client side socket in
qio_channel_socket_connect_async


> @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
>  }
>  #endif /* WIN32 */
>  
> -
>  #ifdef QEMU_MSG_ZEROCOPY
>  static int qio_channel_socket_flush(QIOChannel *ioc,
>                                      Error **errp)

Please remove this unrelated whitespace change.


> @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
>      return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
>  }
>  
> +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> +                                   const struct iovec *iov,
> +                                   size_t niov,
> +                                   Error **errp)
> +{
> +   ssize_t len = 0;
> +   ssize_t total = iov_size(iov, niov);
> +
> +   while (len < total) {
> +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> +
> +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> +            if (qemu_in_coroutine()) {
> +                qio_channel_yield(ioc, G_IO_IN);
> +            } else {
> +                qio_channel_wait(ioc, G_IO_IN);
> +            }
> +            continue;
> +       }
> +       if (len == 0) {
> +           return 0;
> +       }
> +       if (len < 0) {
> +           return -1;
> +       }
> +   }

This will busy wait burning CPU where there is a read > 0 and < total.


With regards,
Daniel
manish.mishra Nov. 22, 2022, 9:08 a.m. UTC | #2
On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
>> MSG_PEEK reads from the peek of channel, The data is treated as
>> unread and the next read shall still return this data. This
>> support is currently added only for socket class. Extra parameter
>> 'flags' is added to io_readv calls to pass extra read flags like
>> MSG_PEEK.
>>
>> Suggested-by: Daniel P. Berrangé <berrange@redhat.com
>> Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
>> ---
>>   chardev/char-socket.c               |  4 +-
>>   include/io/channel.h                | 83 +++++++++++++++++++++++++++++
>>   io/channel-buffer.c                 |  1 +
>>   io/channel-command.c                |  1 +
>>   io/channel-file.c                   |  1 +
>>   io/channel-null.c                   |  1 +
>>   io/channel-socket.c                 | 16 +++++-
>>   io/channel-tls.c                    |  1 +
>>   io/channel-websock.c                |  1 +
>>   io/channel.c                        | 73 +++++++++++++++++++++++--
>>   migration/channel-block.c           |  1 +
>>   scsi/qemu-pr-helper.c               |  2 +-
>>   tests/qtest/tpm-emu.c               |  2 +-
>>   tests/unit/test-io-channel-socket.c |  1 +
>>   util/vhost-user-server.c            |  2 +-
>>   15 files changed, 179 insertions(+), 11 deletions(-)
>
>
>> diff --git a/io/channel-socket.c b/io/channel-socket.c
>> index b76dca9cc1..a06b24766d 100644
>> --- a/io/channel-socket.c
>> +++ b/io/channel-socket.c
>> @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
>>       }
>>   #endif /* WIN32 */
>>   
>> +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
>> +
> This covers the incoming server side socket.
>
> This also needs to be set in outgoing client side socket in
> qio_channel_socket_connect_async


Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.

>
>> @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
>>   }
>>   #endif /* WIN32 */
>>   
>> -
>>   #ifdef QEMU_MSG_ZEROCOPY
>>   static int qio_channel_socket_flush(QIOChannel *ioc,
>>                                       Error **errp)
> Please remove this unrelated whitespace change.
>
>
>> @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
>>       return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
>>   }
>>   
>> +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
>> +                                   const struct iovec *iov,
>> +                                   size_t niov,
>> +                                   Error **errp)
>> +{
>> +   ssize_t len = 0;
>> +   ssize_t total = iov_size(iov, niov);
>> +
>> +   while (len < total) {
>> +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
>> +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
>> +
>> +       if (len == QIO_CHANNEL_ERR_BLOCK) {
>> +            if (qemu_in_coroutine()) {
>> +                qio_channel_yield(ioc, G_IO_IN);
>> +            } else {
>> +                qio_channel_wait(ioc, G_IO_IN);
>> +            }
>> +            continue;
>> +       }
>> +       if (len == 0) {
>> +           return 0;
>> +       }
>> +       if (len < 0) {
>> +           return -1;
>> +       }
>> +   }
> This will busy wait burning CPU where there is a read > 0 and < total.
>

Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.


> With regards,
> Daniel


Thanks

Manish Mishra
Daniel P. Berrangé Nov. 22, 2022, 9:29 a.m. UTC | #3
On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> 
> On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > unread and the next read shall still return this data. This
> > > support is currently added only for socket class. Extra parameter
> > > 'flags' is added to io_readv calls to pass extra read flags like
> > > MSG_PEEK.
> > > 
> > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
> > > ---
> > >   chardev/char-socket.c               |  4 +-
> > >   include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > >   io/channel-buffer.c                 |  1 +
> > >   io/channel-command.c                |  1 +
> > >   io/channel-file.c                   |  1 +
> > >   io/channel-null.c                   |  1 +
> > >   io/channel-socket.c                 | 16 +++++-
> > >   io/channel-tls.c                    |  1 +
> > >   io/channel-websock.c                |  1 +
> > >   io/channel.c                        | 73 +++++++++++++++++++++++--
> > >   migration/channel-block.c           |  1 +
> > >   scsi/qemu-pr-helper.c               |  2 +-
> > >   tests/qtest/tpm-emu.c               |  2 +-
> > >   tests/unit/test-io-channel-socket.c |  1 +
> > >   util/vhost-user-server.c            |  2 +-
> > >   15 files changed, 179 insertions(+), 11 deletions(-)
> > 
> > 
> > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > index b76dca9cc1..a06b24766d 100644
> > > --- a/io/channel-socket.c
> > > +++ b/io/channel-socket.c
> > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > >       }
> > >   #endif /* WIN32 */
> > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > +
> > This covers the incoming server side socket.
> > 
> > This also needs to be set in outgoing client side socket in
> > qio_channel_socket_connect_async
> 
> 
> Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> 
> > 
> > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > >   }
> > >   #endif /* WIN32 */
> > > -
> > >   #ifdef QEMU_MSG_ZEROCOPY
> > >   static int qio_channel_socket_flush(QIOChannel *ioc,
> > >                                       Error **errp)
> > Please remove this unrelated whitespace change.
> > 
> > 
> > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > >       return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > >   }
> > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > +                                   const struct iovec *iov,
> > > +                                   size_t niov,
> > > +                                   Error **errp)
> > > +{
> > > +   ssize_t len = 0;
> > > +   ssize_t total = iov_size(iov, niov);
> > > +
> > > +   while (len < total) {
> > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > +
> > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > +            if (qemu_in_coroutine()) {
> > > +                qio_channel_yield(ioc, G_IO_IN);
> > > +            } else {
> > > +                qio_channel_wait(ioc, G_IO_IN);
> > > +            }
> > > +            continue;
> > > +       }
> > > +       if (len == 0) {
> > > +           return 0;
> > > +       }
> > > +       if (len < 0) {
> > > +           return -1;
> > > +       }
> > > +   }
> > This will busy wait burning CPU where there is a read > 0 and < total.
> > 
> 
> Daniel, i could use MSG_WAITALL too if that works but then we will
> lose opportunity to yield. Or if you have some other idea.

I fear this is an inherant problem with the idea of using PEEK to
look at the magic data.

If we actually read the magic bytes off the wire, then we could have
the same code path for TLS and non-TLS. We would have to modify the
existing later code paths though to take account of fact that the
magic was already read by an earlier codepath.

With regards,
Daniel
manish.mishra Nov. 22, 2022, 9:40 a.m. UTC | #4
On 22/11/22 2:59 pm, Daniel P. Berrangé wrote:
> On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
>> On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
>>> On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
>>>> MSG_PEEK reads from the peek of channel, The data is treated as
>>>> unread and the next read shall still return this data. This
>>>> support is currently added only for socket class. Extra parameter
>>>> 'flags' is added to io_readv calls to pass extra read flags like
>>>> MSG_PEEK.
>>>>
>>>> Suggested-by: Daniel P. Berrangé <berrange@redhat.com
>>>> Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
>>>> ---
>>>>    chardev/char-socket.c               |  4 +-
>>>>    include/io/channel.h                | 83 +++++++++++++++++++++++++++++
>>>>    io/channel-buffer.c                 |  1 +
>>>>    io/channel-command.c                |  1 +
>>>>    io/channel-file.c                   |  1 +
>>>>    io/channel-null.c                   |  1 +
>>>>    io/channel-socket.c                 | 16 +++++-
>>>>    io/channel-tls.c                    |  1 +
>>>>    io/channel-websock.c                |  1 +
>>>>    io/channel.c                        | 73 +++++++++++++++++++++++--
>>>>    migration/channel-block.c           |  1 +
>>>>    scsi/qemu-pr-helper.c               |  2 +-
>>>>    tests/qtest/tpm-emu.c               |  2 +-
>>>>    tests/unit/test-io-channel-socket.c |  1 +
>>>>    util/vhost-user-server.c            |  2 +-
>>>>    15 files changed, 179 insertions(+), 11 deletions(-)
>>>
>>>> diff --git a/io/channel-socket.c b/io/channel-socket.c
>>>> index b76dca9cc1..a06b24766d 100644
>>>> --- a/io/channel-socket.c
>>>> +++ b/io/channel-socket.c
>>>> @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
>>>>        }
>>>>    #endif /* WIN32 */
>>>> +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
>>>> +
>>> This covers the incoming server side socket.
>>>
>>> This also needs to be set in outgoing client side socket in
>>> qio_channel_socket_connect_async
>>
>> Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
>>
>>>> @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
>>>>    }
>>>>    #endif /* WIN32 */
>>>> -
>>>>    #ifdef QEMU_MSG_ZEROCOPY
>>>>    static int qio_channel_socket_flush(QIOChannel *ioc,
>>>>                                        Error **errp)
>>> Please remove this unrelated whitespace change.
>>>
>>>
>>>> @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
>>>>        return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
>>>>    }
>>>> +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
>>>> +                                   const struct iovec *iov,
>>>> +                                   size_t niov,
>>>> +                                   Error **errp)
>>>> +{
>>>> +   ssize_t len = 0;
>>>> +   ssize_t total = iov_size(iov, niov);
>>>> +
>>>> +   while (len < total) {
>>>> +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
>>>> +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
>>>> +
>>>> +       if (len == QIO_CHANNEL_ERR_BLOCK) {
>>>> +            if (qemu_in_coroutine()) {
>>>> +                qio_channel_yield(ioc, G_IO_IN);
>>>> +            } else {
>>>> +                qio_channel_wait(ioc, G_IO_IN);
>>>> +            }
>>>> +            continue;
>>>> +       }
>>>> +       if (len == 0) {
>>>> +           return 0;
>>>> +       }
>>>> +       if (len < 0) {
>>>> +           return -1;
>>>> +       }
>>>> +   }
>>> This will busy wait burning CPU where there is a read > 0 and < total.
>>>
>> Daniel, i could use MSG_WAITALL too if that works but then we will
>> lose opportunity to yield. Or if you have some other idea.
> I fear this is an inherant problem with the idea of using PEEK to
> look at the magic data.
>
> If we actually read the magic bytes off the wire, then we could have
> the same code path for TLS and non-TLS. We would have to modify the
> existing later code paths though to take account of fact that the
> magic was already read by an earlier codepath.
>
> With regards,
> Daniel


sure Daniel, I am happy to drop use of MSG_PEEK, but that way also we have issue with tls for reason we discussed in V2. Is it okay to send a patch with actual read ahead but not for tls case? tls anyway does not have this bug as it does handshake.

Thanks

Manish Mishra
Daniel P. Berrangé Nov. 22, 2022, 9:53 a.m. UTC | #5
On Tue, Nov 22, 2022 at 03:10:53PM +0530, manish.mishra wrote:
> 
> On 22/11/22 2:59 pm, Daniel P. Berrangé wrote:
> > On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> > > On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > > > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > > > unread and the next read shall still return this data. This
> > > > > support is currently added only for socket class. Extra parameter
> > > > > 'flags' is added to io_readv calls to pass extra read flags like
> > > > > MSG_PEEK.
> > > > > 
> > > > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > > > Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
> > > > > ---
> > > > >    chardev/char-socket.c               |  4 +-
> > > > >    include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > > > >    io/channel-buffer.c                 |  1 +
> > > > >    io/channel-command.c                |  1 +
> > > > >    io/channel-file.c                   |  1 +
> > > > >    io/channel-null.c                   |  1 +
> > > > >    io/channel-socket.c                 | 16 +++++-
> > > > >    io/channel-tls.c                    |  1 +
> > > > >    io/channel-websock.c                |  1 +
> > > > >    io/channel.c                        | 73 +++++++++++++++++++++++--
> > > > >    migration/channel-block.c           |  1 +
> > > > >    scsi/qemu-pr-helper.c               |  2 +-
> > > > >    tests/qtest/tpm-emu.c               |  2 +-
> > > > >    tests/unit/test-io-channel-socket.c |  1 +
> > > > >    util/vhost-user-server.c            |  2 +-
> > > > >    15 files changed, 179 insertions(+), 11 deletions(-)
> > > > 
> > > > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > > > index b76dca9cc1..a06b24766d 100644
> > > > > --- a/io/channel-socket.c
> > > > > +++ b/io/channel-socket.c
> > > > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > > > >        }
> > > > >    #endif /* WIN32 */
> > > > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > > > +
> > > > This covers the incoming server side socket.
> > > > 
> > > > This also needs to be set in outgoing client side socket in
> > > > qio_channel_socket_connect_async
> > > 
> > > Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> > > 
> > > > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > >    }
> > > > >    #endif /* WIN32 */
> > > > > -
> > > > >    #ifdef QEMU_MSG_ZEROCOPY
> > > > >    static int qio_channel_socket_flush(QIOChannel *ioc,
> > > > >                                        Error **errp)
> > > > Please remove this unrelated whitespace change.
> > > > 
> > > > 
> > > > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > > > >        return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > > > >    }
> > > > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > > > +                                   const struct iovec *iov,
> > > > > +                                   size_t niov,
> > > > > +                                   Error **errp)
> > > > > +{
> > > > > +   ssize_t len = 0;
> > > > > +   ssize_t total = iov_size(iov, niov);
> > > > > +
> > > > > +   while (len < total) {
> > > > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > > > +
> > > > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > > > +            if (qemu_in_coroutine()) {
> > > > > +                qio_channel_yield(ioc, G_IO_IN);
> > > > > +            } else {
> > > > > +                qio_channel_wait(ioc, G_IO_IN);
> > > > > +            }
> > > > > +            continue;
> > > > > +       }
> > > > > +       if (len == 0) {
> > > > > +           return 0;
> > > > > +       }
> > > > > +       if (len < 0) {
> > > > > +           return -1;
> > > > > +       }
> > > > > +   }
> > > > This will busy wait burning CPU where there is a read > 0 and < total.
> > > > 
> > > Daniel, i could use MSG_WAITALL too if that works but then we will
> > > lose opportunity to yield. Or if you have some other idea.
> > I fear this is an inherant problem with the idea of using PEEK to
> > look at the magic data.
> > 
> > If we actually read the magic bytes off the wire, then we could have
> > the same code path for TLS and non-TLS. We would have to modify the
> > existing later code paths though to take account of fact that the
> > magic was already read by an earlier codepath.
> > 
> > With regards,
> > Daniel
> 
> 
> sure Daniel, I am happy to drop use of MSG_PEEK, but that way also we
> have issue with tls for reason we discussed in V2. Is it okay to send
> a patch with actual read ahead but not for tls case? tls anyway does
> not have this bug as it does handshake.

I've re-read the previous threads, but I don't see what the problem
with TLS is.  We already decided that TLS is not affected by the
race condition. So there should be no problem in reading the magic
bytes early on the TLS channels, while reading the bytes early on
a non-TLS channel will fix the race condition. 

With regards,
Daniel
manish.mishra Nov. 22, 2022, 10:13 a.m. UTC | #6
On 22/11/22 3:23 pm, Daniel P. Berrangé wrote:
> On Tue, Nov 22, 2022 at 03:10:53PM +0530, manish.mishra wrote:
>> On 22/11/22 2:59 pm, Daniel P. Berrangé wrote:
>>> On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
>>>> On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
>>>>> On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
>>>>>> MSG_PEEK reads from the peek of channel, The data is treated as
>>>>>> unread and the next read shall still return this data. This
>>>>>> support is currently added only for socket class. Extra parameter
>>>>>> 'flags' is added to io_readv calls to pass extra read flags like
>>>>>> MSG_PEEK.
>>>>>>
>>>>>> Suggested-by: Daniel P. Berrangé <berrange@redhat.com
>>>>>> Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
>>>>>> ---
>>>>>>     chardev/char-socket.c               |  4 +-
>>>>>>     include/io/channel.h                | 83 +++++++++++++++++++++++++++++
>>>>>>     io/channel-buffer.c                 |  1 +
>>>>>>     io/channel-command.c                |  1 +
>>>>>>     io/channel-file.c                   |  1 +
>>>>>>     io/channel-null.c                   |  1 +
>>>>>>     io/channel-socket.c                 | 16 +++++-
>>>>>>     io/channel-tls.c                    |  1 +
>>>>>>     io/channel-websock.c                |  1 +
>>>>>>     io/channel.c                        | 73 +++++++++++++++++++++++--
>>>>>>     migration/channel-block.c           |  1 +
>>>>>>     scsi/qemu-pr-helper.c               |  2 +-
>>>>>>     tests/qtest/tpm-emu.c               |  2 +-
>>>>>>     tests/unit/test-io-channel-socket.c |  1 +
>>>>>>     util/vhost-user-server.c            |  2 +-
>>>>>>     15 files changed, 179 insertions(+), 11 deletions(-)
>>>>>> diff --git a/io/channel-socket.c b/io/channel-socket.c
>>>>>> index b76dca9cc1..a06b24766d 100644
>>>>>> --- a/io/channel-socket.c
>>>>>> +++ b/io/channel-socket.c
>>>>>> @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
>>>>>>         }
>>>>>>     #endif /* WIN32 */
>>>>>> +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
>>>>>> +
>>>>> This covers the incoming server side socket.
>>>>>
>>>>> This also needs to be set in outgoing client side socket in
>>>>> qio_channel_socket_connect_async
>>>> Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
>>>>
>>>>>> @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
>>>>>>     }
>>>>>>     #endif /* WIN32 */
>>>>>> -
>>>>>>     #ifdef QEMU_MSG_ZEROCOPY
>>>>>>     static int qio_channel_socket_flush(QIOChannel *ioc,
>>>>>>                                         Error **errp)
>>>>> Please remove this unrelated whitespace change.
>>>>>
>>>>>
>>>>>> @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
>>>>>>         return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
>>>>>>     }
>>>>>> +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
>>>>>> +                                   const struct iovec *iov,
>>>>>> +                                   size_t niov,
>>>>>> +                                   Error **errp)
>>>>>> +{
>>>>>> +   ssize_t len = 0;
>>>>>> +   ssize_t total = iov_size(iov, niov);
>>>>>> +
>>>>>> +   while (len < total) {
>>>>>> +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
>>>>>> +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
>>>>>> +
>>>>>> +       if (len == QIO_CHANNEL_ERR_BLOCK) {
>>>>>> +            if (qemu_in_coroutine()) {
>>>>>> +                qio_channel_yield(ioc, G_IO_IN);
>>>>>> +            } else {
>>>>>> +                qio_channel_wait(ioc, G_IO_IN);
>>>>>> +            }
>>>>>> +            continue;
>>>>>> +       }
>>>>>> +       if (len == 0) {
>>>>>> +           return 0;
>>>>>> +       }
>>>>>> +       if (len < 0) {
>>>>>> +           return -1;
>>>>>> +       }
>>>>>> +   }
>>>>> This will busy wait burning CPU where there is a read > 0 and < total.
>>>>>
>>>> Daniel, i could use MSG_WAITALL too if that works but then we will
>>>> lose opportunity to yield. Or if you have some other idea.
>>> I fear this is an inherant problem with the idea of using PEEK to
>>> look at the magic data.
>>>
>>> If we actually read the magic bytes off the wire, then we could have
>>> the same code path for TLS and non-TLS. We would have to modify the
>>> existing later code paths though to take account of fact that the
>>> magic was already read by an earlier codepath.
>>>
>>> With regards,
>>> Daniel
>>
>> sure Daniel, I am happy to drop use of MSG_PEEK, but that way also we
>> have issue with tls for reason we discussed in V2. Is it okay to send
>> a patch with actual read ahead but not for tls case? tls anyway does
>> not have this bug as it does handshake.
> I've re-read the previous threads, but I don't see what the problem
> with TLS is.  We already decided that TLS is not affected by the
> race condition. So there should be no problem in reading the magic
> bytes early on the TLS channels, while reading the bytes early on
> a non-TLS channel will fix the race condition.


Actually with tls all channels requires handshake to be assumed established, and from source side we do initial qemu_flush only when all channels are established. But on destination side we will stuck on reading magic for main channel itself which never comes because source has not flushed data, so no new connections can be established(e.g. multiFD). So basically destination can not accept any new channel until we read from main channel and source is not putting any data on main channel until all channels are established. So if we read ahread in ioc_process_incoming_channel there is this deadlock with tls. This issue is not there with non-tls case, because there on source side we assume a connection established once connect() call is successful.


Thanks

Manish Mishra

>
> With regards,
> Daniel
Daniel P. Berrangé Nov. 22, 2022, 10:31 a.m. UTC | #7
On Tue, Nov 22, 2022 at 03:43:55PM +0530, manish.mishra wrote:
> 
> On 22/11/22 3:23 pm, Daniel P. Berrangé wrote:
> > On Tue, Nov 22, 2022 at 03:10:53PM +0530, manish.mishra wrote:
> > > On 22/11/22 2:59 pm, Daniel P. Berrangé wrote:
> > > > On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> > > > > On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > > > > > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > > > > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > > > > > unread and the next read shall still return this data. This
> > > > > > > support is currently added only for socket class. Extra parameter
> > > > > > > 'flags' is added to io_readv calls to pass extra read flags like
> > > > > > > MSG_PEEK.
> > > > > > > 
> > > > > > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > > > > > Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
> > > > > > > ---
> > > > > > >     chardev/char-socket.c               |  4 +-
> > > > > > >     include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > > > > > >     io/channel-buffer.c                 |  1 +
> > > > > > >     io/channel-command.c                |  1 +
> > > > > > >     io/channel-file.c                   |  1 +
> > > > > > >     io/channel-null.c                   |  1 +
> > > > > > >     io/channel-socket.c                 | 16 +++++-
> > > > > > >     io/channel-tls.c                    |  1 +
> > > > > > >     io/channel-websock.c                |  1 +
> > > > > > >     io/channel.c                        | 73 +++++++++++++++++++++++--
> > > > > > >     migration/channel-block.c           |  1 +
> > > > > > >     scsi/qemu-pr-helper.c               |  2 +-
> > > > > > >     tests/qtest/tpm-emu.c               |  2 +-
> > > > > > >     tests/unit/test-io-channel-socket.c |  1 +
> > > > > > >     util/vhost-user-server.c            |  2 +-
> > > > > > >     15 files changed, 179 insertions(+), 11 deletions(-)
> > > > > > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > > > > > index b76dca9cc1..a06b24766d 100644
> > > > > > > --- a/io/channel-socket.c
> > > > > > > +++ b/io/channel-socket.c
> > > > > > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > > > > > >         }
> > > > > > >     #endif /* WIN32 */
> > > > > > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > > > > > +
> > > > > > This covers the incoming server side socket.
> > > > > > 
> > > > > > This also needs to be set in outgoing client side socket in
> > > > > > qio_channel_socket_connect_async
> > > > > Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> > > > > 
> > > > > > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > > > >     }
> > > > > > >     #endif /* WIN32 */
> > > > > > > -
> > > > > > >     #ifdef QEMU_MSG_ZEROCOPY
> > > > > > >     static int qio_channel_socket_flush(QIOChannel *ioc,
> > > > > > >                                         Error **errp)
> > > > > > Please remove this unrelated whitespace change.
> > > > > > 
> > > > > > 
> > > > > > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > > > > > >         return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > > > > > >     }
> > > > > > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > > > > > +                                   const struct iovec *iov,
> > > > > > > +                                   size_t niov,
> > > > > > > +                                   Error **errp)
> > > > > > > +{
> > > > > > > +   ssize_t len = 0;
> > > > > > > +   ssize_t total = iov_size(iov, niov);
> > > > > > > +
> > > > > > > +   while (len < total) {
> > > > > > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > > > > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > > > > > +
> > > > > > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > > > > > +            if (qemu_in_coroutine()) {
> > > > > > > +                qio_channel_yield(ioc, G_IO_IN);
> > > > > > > +            } else {
> > > > > > > +                qio_channel_wait(ioc, G_IO_IN);
> > > > > > > +            }
> > > > > > > +            continue;
> > > > > > > +       }
> > > > > > > +       if (len == 0) {
> > > > > > > +           return 0;
> > > > > > > +       }
> > > > > > > +       if (len < 0) {
> > > > > > > +           return -1;
> > > > > > > +       }
> > > > > > > +   }
> > > > > > This will busy wait burning CPU where there is a read > 0 and < total.
> > > > > > 
> > > > > Daniel, i could use MSG_WAITALL too if that works but then we will
> > > > > lose opportunity to yield. Or if you have some other idea.
> > > > I fear this is an inherant problem with the idea of using PEEK to
> > > > look at the magic data.
> > > > 
> > > > If we actually read the magic bytes off the wire, then we could have
> > > > the same code path for TLS and non-TLS. We would have to modify the
> > > > existing later code paths though to take account of fact that the
> > > > magic was already read by an earlier codepath.
> > > > 
> > > > With regards,
> > > > Daniel
> > > 
> > > sure Daniel, I am happy to drop use of MSG_PEEK, but that way also we
> > > have issue with tls for reason we discussed in V2. Is it okay to send
> > > a patch with actual read ahead but not for tls case? tls anyway does
> > > not have this bug as it does handshake.
> > I've re-read the previous threads, but I don't see what the problem
> > with TLS is.  We already decided that TLS is not affected by the
> > race condition. So there should be no problem in reading the magic
> > bytes early on the TLS channels, while reading the bytes early on
> > a non-TLS channel will fix the race condition.
> 
> 
> Actually with tls all channels requires handshake to be assumed established,
> and from source side we do initial qemu_flush only when all channels are
> established. But on destination side we will stuck on reading magic for
> main channel itself which never comes because source has not flushed data,
> so no new connections can be established(e.g. multiFD). So basically
> destination can not accept any new channel until we read from main
> channel and source is not putting any data on main channel until all
> channels are established. So if we read ahread in
> ioc_process_incoming_channel there is this deadlock with tls. This issue
> is not there with non-tls case, because there on source side we assume a
> connection established once connect() call is successful.

Ah yes, I forgot about the 'flush' problem. Reading magic in non-TLS case
is OK then i guess.

With regards,
Daniel
Peter Xu Nov. 22, 2022, 2:41 p.m. UTC | #8
On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> 
> On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > unread and the next read shall still return this data. This
> > > support is currently added only for socket class. Extra parameter
> > > 'flags' is added to io_readv calls to pass extra read flags like
> > > MSG_PEEK.
> > > 
> > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
> > > ---
> > >   chardev/char-socket.c               |  4 +-
> > >   include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > >   io/channel-buffer.c                 |  1 +
> > >   io/channel-command.c                |  1 +
> > >   io/channel-file.c                   |  1 +
> > >   io/channel-null.c                   |  1 +
> > >   io/channel-socket.c                 | 16 +++++-
> > >   io/channel-tls.c                    |  1 +
> > >   io/channel-websock.c                |  1 +
> > >   io/channel.c                        | 73 +++++++++++++++++++++++--
> > >   migration/channel-block.c           |  1 +
> > >   scsi/qemu-pr-helper.c               |  2 +-
> > >   tests/qtest/tpm-emu.c               |  2 +-
> > >   tests/unit/test-io-channel-socket.c |  1 +
> > >   util/vhost-user-server.c            |  2 +-
> > >   15 files changed, 179 insertions(+), 11 deletions(-)
> > 
> > 
> > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > index b76dca9cc1..a06b24766d 100644
> > > --- a/io/channel-socket.c
> > > +++ b/io/channel-socket.c
> > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > >       }
> > >   #endif /* WIN32 */
> > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > +
> > This covers the incoming server side socket.
> > 
> > This also needs to be set in outgoing client side socket in
> > qio_channel_socket_connect_async
> 
> 
> Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> 
> > 
> > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > >   }
> > >   #endif /* WIN32 */
> > > -
> > >   #ifdef QEMU_MSG_ZEROCOPY
> > >   static int qio_channel_socket_flush(QIOChannel *ioc,
> > >                                       Error **errp)
> > Please remove this unrelated whitespace change.
> > 
> > 
> > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > >       return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > >   }
> > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > +                                   const struct iovec *iov,
> > > +                                   size_t niov,
> > > +                                   Error **errp)
> > > +{
> > > +   ssize_t len = 0;
> > > +   ssize_t total = iov_size(iov, niov);
> > > +
> > > +   while (len < total) {
> > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > +
> > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > +            if (qemu_in_coroutine()) {
> > > +                qio_channel_yield(ioc, G_IO_IN);
> > > +            } else {
> > > +                qio_channel_wait(ioc, G_IO_IN);
> > > +            }
> > > +            continue;
> > > +       }
> > > +       if (len == 0) {
> > > +           return 0;
> > > +       }
> > > +       if (len < 0) {
> > > +           return -1;
> > > +       }
> > > +   }
> > This will busy wait burning CPU where there is a read > 0 and < total.
> > 
> 
> Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.

How easy would this happen?

Another alternative is we could just return the partial len to caller then
we fallback to the original channel orders if it happens.  And then if it
mostly will never happen it'll behave merely the same as what we want.

The thing is the other approach will be hacky in another way (have a flag
migration_consumed_4_bytes_header to either main and multifd channels),
then if it'll solve 99.99% cases I'd think it's good enough.  Anyway we're
working on a corner case already on unreliable network, and even if failure
triggered it's not so bad - we just redo the migration.
Daniel P. Berrangé Nov. 22, 2022, 2:49 p.m. UTC | #9
On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:
> On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> > 
> > On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > > unread and the next read shall still return this data. This
> > > > support is currently added only for socket class. Extra parameter
> > > > 'flags' is added to io_readv calls to pass extra read flags like
> > > > MSG_PEEK.
> > > > 
> > > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > > Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
> > > > ---
> > > >   chardev/char-socket.c               |  4 +-
> > > >   include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > > >   io/channel-buffer.c                 |  1 +
> > > >   io/channel-command.c                |  1 +
> > > >   io/channel-file.c                   |  1 +
> > > >   io/channel-null.c                   |  1 +
> > > >   io/channel-socket.c                 | 16 +++++-
> > > >   io/channel-tls.c                    |  1 +
> > > >   io/channel-websock.c                |  1 +
> > > >   io/channel.c                        | 73 +++++++++++++++++++++++--
> > > >   migration/channel-block.c           |  1 +
> > > >   scsi/qemu-pr-helper.c               |  2 +-
> > > >   tests/qtest/tpm-emu.c               |  2 +-
> > > >   tests/unit/test-io-channel-socket.c |  1 +
> > > >   util/vhost-user-server.c            |  2 +-
> > > >   15 files changed, 179 insertions(+), 11 deletions(-)
> > > 
> > > 
> > > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > > index b76dca9cc1..a06b24766d 100644
> > > > --- a/io/channel-socket.c
> > > > +++ b/io/channel-socket.c
> > > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > > >       }
> > > >   #endif /* WIN32 */
> > > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > > +
> > > This covers the incoming server side socket.
> > > 
> > > This also needs to be set in outgoing client side socket in
> > > qio_channel_socket_connect_async
> > 
> > 
> > Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> > 
> > > 
> > > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > >   }
> > > >   #endif /* WIN32 */
> > > > -
> > > >   #ifdef QEMU_MSG_ZEROCOPY
> > > >   static int qio_channel_socket_flush(QIOChannel *ioc,
> > > >                                       Error **errp)
> > > Please remove this unrelated whitespace change.
> > > 
> > > 
> > > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > > >       return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > > >   }
> > > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > > +                                   const struct iovec *iov,
> > > > +                                   size_t niov,
> > > > +                                   Error **errp)
> > > > +{
> > > > +   ssize_t len = 0;
> > > > +   ssize_t total = iov_size(iov, niov);
> > > > +
> > > > +   while (len < total) {
> > > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > > +
> > > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > > +            if (qemu_in_coroutine()) {
> > > > +                qio_channel_yield(ioc, G_IO_IN);
> > > > +            } else {
> > > > +                qio_channel_wait(ioc, G_IO_IN);
> > > > +            }
> > > > +            continue;
> > > > +       }
> > > > +       if (len == 0) {
> > > > +           return 0;
> > > > +       }
> > > > +       if (len < 0) {
> > > > +           return -1;
> > > > +       }
> > > > +   }
> > > This will busy wait burning CPU where there is a read > 0 and < total.
> > > 
> > 
> > Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.
> 
> How easy would this happen?
> 
> Another alternative is we could just return the partial len to caller then
> we fallback to the original channel orders if it happens.  And then if it
> mostly will never happen it'll behave merely the same as what we want.

Well we're trying to deal with a bug where the slow and/or unreliable
network causes channels to arrive in unexpected order. Given we know
we're having network trouble, I wouldn't want to make more assumptions
about things happening correctly.


With regards,
Daniel
manish.mishra Nov. 22, 2022, 3:31 p.m. UTC | #10
On 22/11/22 8:19 pm, Daniel P. Berrangé wrote:
> On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:
>> On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
>>> On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
>>>> On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
>>>>> MSG_PEEK reads from the peek of channel, The data is treated as
>>>>> unread and the next read shall still return this data. This
>>>>> support is currently added only for socket class. Extra parameter
>>>>> 'flags' is added to io_readv calls to pass extra read flags like
>>>>> MSG_PEEK.
>>>>>
>>>>> Suggested-by: Daniel P. Berrangé <berrange@redhat.com
>>>>> Signed-off-by: manish.mishra<manish.mishra@nutanix.com>
>>>>> ---
>>>>>    chardev/char-socket.c               |  4 +-
>>>>>    include/io/channel.h                | 83 +++++++++++++++++++++++++++++
>>>>>    io/channel-buffer.c                 |  1 +
>>>>>    io/channel-command.c                |  1 +
>>>>>    io/channel-file.c                   |  1 +
>>>>>    io/channel-null.c                   |  1 +
>>>>>    io/channel-socket.c                 | 16 +++++-
>>>>>    io/channel-tls.c                    |  1 +
>>>>>    io/channel-websock.c                |  1 +
>>>>>    io/channel.c                        | 73 +++++++++++++++++++++++--
>>>>>    migration/channel-block.c           |  1 +
>>>>>    scsi/qemu-pr-helper.c               |  2 +-
>>>>>    tests/qtest/tpm-emu.c               |  2 +-
>>>>>    tests/unit/test-io-channel-socket.c |  1 +
>>>>>    util/vhost-user-server.c            |  2 +-
>>>>>    15 files changed, 179 insertions(+), 11 deletions(-)
>>>>
>>>>> diff --git a/io/channel-socket.c b/io/channel-socket.c
>>>>> index b76dca9cc1..a06b24766d 100644
>>>>> --- a/io/channel-socket.c
>>>>> +++ b/io/channel-socket.c
>>>>> @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
>>>>>        }
>>>>>    #endif /* WIN32 */
>>>>> +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
>>>>> +
>>>> This covers the incoming server side socket.
>>>>
>>>> This also needs to be set in outgoing client side socket in
>>>> qio_channel_socket_connect_async
>>>
>>> Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
>>>
>>>>> @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
>>>>>    }
>>>>>    #endif /* WIN32 */
>>>>> -
>>>>>    #ifdef QEMU_MSG_ZEROCOPY
>>>>>    static int qio_channel_socket_flush(QIOChannel *ioc,
>>>>>                                        Error **errp)
>>>> Please remove this unrelated whitespace change.
>>>>
>>>>
>>>>> @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
>>>>>        return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
>>>>>    }
>>>>> +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
>>>>> +                                   const struct iovec *iov,
>>>>> +                                   size_t niov,
>>>>> +                                   Error **errp)
>>>>> +{
>>>>> +   ssize_t len = 0;
>>>>> +   ssize_t total = iov_size(iov, niov);
>>>>> +
>>>>> +   while (len < total) {
>>>>> +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
>>>>> +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
>>>>> +
>>>>> +       if (len == QIO_CHANNEL_ERR_BLOCK) {
>>>>> +            if (qemu_in_coroutine()) {
>>>>> +                qio_channel_yield(ioc, G_IO_IN);
>>>>> +            } else {
>>>>> +                qio_channel_wait(ioc, G_IO_IN);
>>>>> +            }
>>>>> +            continue;
>>>>> +       }
>>>>> +       if (len == 0) {
>>>>> +           return 0;
>>>>> +       }
>>>>> +       if (len < 0) {
>>>>> +           return -1;
>>>>> +       }
>>>>> +   }
>>>> This will busy wait burning CPU where there is a read > 0 and < total.
>>>>
>>> Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.
>> How easy would this happen?
>>
>> Another alternative is we could just return the partial len to caller then
>> we fallback to the original channel orders if it happens.  And then if it
>> mostly will never happen it'll behave merely the same as what we want.
> Well we're trying to deal with a bug where the slow and/or unreliable
> network causes channels to arrive in unexpected order. Given we know
> we're having network trouble, I wouldn't want to make more assumptions
> about things happening correctly.
>
>
> With regards,
> Daniel


Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but looks like even though chances are less it can still return partial data even with multiple retries for signal case, so is not full proof.

*MSG_WAITALL *(since Linux 2.2)
               This flag requests that the operation block until the full
               request is satisfied.  However, the call may still return
               less data than requested if a signal is caught, an error
               or disconnect occurs, or the next data to be received is
               of a different type than that returned.  This flag has no
               effect for datagram sockets.

Actual read ahead will be little hackish, so just confirming we all are in agreement to do actual read ahead and i can send patch? :)

Thanks
Manish Mishra
Peter Xu Nov. 22, 2022, 4:10 p.m. UTC | #11
On Tue, Nov 22, 2022 at 09:01:59PM +0530, manish.mishra wrote:
> 
> On 22/11/22 8:19 pm, Daniel P. Berrangé wrote:
> > On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:
> > > On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> > > > On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > > > > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > > > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > > > > unread and the next read shall still return this data. This
> > > > > > support is currently added only for socket class. Extra parameter
> > > > > > 'flags' is added to io_readv calls to pass extra read flags like
> > > > > > MSG_PEEK.
> > > > > > 
> > > > > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > > > > Signed-off-by: manish.mishra<manish.mishra@nutanix.com>
> > > > > > ---
> > > > > >    chardev/char-socket.c               |  4 +-
> > > > > >    include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > > > > >    io/channel-buffer.c                 |  1 +
> > > > > >    io/channel-command.c                |  1 +
> > > > > >    io/channel-file.c                   |  1 +
> > > > > >    io/channel-null.c                   |  1 +
> > > > > >    io/channel-socket.c                 | 16 +++++-
> > > > > >    io/channel-tls.c                    |  1 +
> > > > > >    io/channel-websock.c                |  1 +
> > > > > >    io/channel.c                        | 73 +++++++++++++++++++++++--
> > > > > >    migration/channel-block.c           |  1 +
> > > > > >    scsi/qemu-pr-helper.c               |  2 +-
> > > > > >    tests/qtest/tpm-emu.c               |  2 +-
> > > > > >    tests/unit/test-io-channel-socket.c |  1 +
> > > > > >    util/vhost-user-server.c            |  2 +-
> > > > > >    15 files changed, 179 insertions(+), 11 deletions(-)
> > > > > 
> > > > > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > > > > index b76dca9cc1..a06b24766d 100644
> > > > > > --- a/io/channel-socket.c
> > > > > > +++ b/io/channel-socket.c
> > > > > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > > > > >        }
> > > > > >    #endif /* WIN32 */
> > > > > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > > > > +
> > > > > This covers the incoming server side socket.
> > > > > 
> > > > > This also needs to be set in outgoing client side socket in
> > > > > qio_channel_socket_connect_async
> > > > 
> > > > Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> > > > 
> > > > > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > > >    }
> > > > > >    #endif /* WIN32 */
> > > > > > -
> > > > > >    #ifdef QEMU_MSG_ZEROCOPY
> > > > > >    static int qio_channel_socket_flush(QIOChannel *ioc,
> > > > > >                                        Error **errp)
> > > > > Please remove this unrelated whitespace change.
> > > > > 
> > > > > 
> > > > > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > > > > >        return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > > > > >    }
> > > > > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > > > > +                                   const struct iovec *iov,
> > > > > > +                                   size_t niov,
> > > > > > +                                   Error **errp)
> > > > > > +{
> > > > > > +   ssize_t len = 0;
> > > > > > +   ssize_t total = iov_size(iov, niov);
> > > > > > +
> > > > > > +   while (len < total) {
> > > > > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > > > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > > > > +
> > > > > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > > > > +            if (qemu_in_coroutine()) {
> > > > > > +                qio_channel_yield(ioc, G_IO_IN);
> > > > > > +            } else {
> > > > > > +                qio_channel_wait(ioc, G_IO_IN);
> > > > > > +            }
> > > > > > +            continue;
> > > > > > +       }
> > > > > > +       if (len == 0) {
> > > > > > +           return 0;
> > > > > > +       }
> > > > > > +       if (len < 0) {
> > > > > > +           return -1;
> > > > > > +       }
> > > > > > +   }
> > > > > This will busy wait burning CPU where there is a read > 0 and < total.
> > > > > 
> > > > Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.
> > > How easy would this happen?
> > > 
> > > Another alternative is we could just return the partial len to caller then
> > > we fallback to the original channel orders if it happens.  And then if it
> > > mostly will never happen it'll behave merely the same as what we want.
> > Well we're trying to deal with a bug where the slow and/or unreliable
> > network causes channels to arrive in unexpected order. Given we know
> > we're having network trouble, I wouldn't want to make more assumptions
> > about things happening correctly.
> > 
> > 
> > With regards,
> > Daniel
> 
> 
> Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but looks like even though chances are less it can still return partial data even with multiple retries for signal case, so is not full proof.
> 
> *MSG_WAITALL *(since Linux 2.2)
>               This flag requests that the operation block until the full
>               request is satisfied.  However, the call may still return
>               less data than requested if a signal is caught, an error
>               or disconnect occurs, or the next data to be received is
>               of a different type than that returned.  This flag has no
>               effect for datagram sockets.
> 
> Actual read ahead will be little hackish, so just confirming we all are in agreement to do actual read ahead and i can send patch? :)

Yet another option is the caller handles partial PEEK and then we can sleep
in the migration code before another PEEK attempt until it reaches the full
length.

Even with that explicit sleep code IMHO it is cleaner than the read-header
flag plus things like !tls check just to avoid the handshake dead lock
itself (and if to go with this route we'd better also have a full document
on why !tls, aka, how the dead lock can happen).

Would that work?
Peter Xu Nov. 22, 2022, 4:29 p.m. UTC | #12
On Tue, Nov 22, 2022 at 11:10:18AM -0500, Peter Xu wrote:
> On Tue, Nov 22, 2022 at 09:01:59PM +0530, manish.mishra wrote:
> > 
> > On 22/11/22 8:19 pm, Daniel P. Berrangé wrote:
> > > On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:
> > > > On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> > > > > On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > > > > > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > > > > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > > > > > unread and the next read shall still return this data. This
> > > > > > > support is currently added only for socket class. Extra parameter
> > > > > > > 'flags' is added to io_readv calls to pass extra read flags like
> > > > > > > MSG_PEEK.
> > > > > > > 
> > > > > > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > > > > > Signed-off-by: manish.mishra<manish.mishra@nutanix.com>
> > > > > > > ---
> > > > > > >    chardev/char-socket.c               |  4 +-
> > > > > > >    include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > > > > > >    io/channel-buffer.c                 |  1 +
> > > > > > >    io/channel-command.c                |  1 +
> > > > > > >    io/channel-file.c                   |  1 +
> > > > > > >    io/channel-null.c                   |  1 +
> > > > > > >    io/channel-socket.c                 | 16 +++++-
> > > > > > >    io/channel-tls.c                    |  1 +
> > > > > > >    io/channel-websock.c                |  1 +
> > > > > > >    io/channel.c                        | 73 +++++++++++++++++++++++--
> > > > > > >    migration/channel-block.c           |  1 +
> > > > > > >    scsi/qemu-pr-helper.c               |  2 +-
> > > > > > >    tests/qtest/tpm-emu.c               |  2 +-
> > > > > > >    tests/unit/test-io-channel-socket.c |  1 +
> > > > > > >    util/vhost-user-server.c            |  2 +-
> > > > > > >    15 files changed, 179 insertions(+), 11 deletions(-)
> > > > > > 
> > > > > > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > > > > > index b76dca9cc1..a06b24766d 100644
> > > > > > > --- a/io/channel-socket.c
> > > > > > > +++ b/io/channel-socket.c
> > > > > > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > > > > > >        }
> > > > > > >    #endif /* WIN32 */
> > > > > > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > > > > > +
> > > > > > This covers the incoming server side socket.
> > > > > > 
> > > > > > This also needs to be set in outgoing client side socket in
> > > > > > qio_channel_socket_connect_async
> > > > > 
> > > > > Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> > > > > 
> > > > > > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > > > >    }
> > > > > > >    #endif /* WIN32 */
> > > > > > > -
> > > > > > >    #ifdef QEMU_MSG_ZEROCOPY
> > > > > > >    static int qio_channel_socket_flush(QIOChannel *ioc,
> > > > > > >                                        Error **errp)
> > > > > > Please remove this unrelated whitespace change.
> > > > > > 
> > > > > > 
> > > > > > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > > > > > >        return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > > > > > >    }
> > > > > > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > > > > > +                                   const struct iovec *iov,
> > > > > > > +                                   size_t niov,
> > > > > > > +                                   Error **errp)
> > > > > > > +{
> > > > > > > +   ssize_t len = 0;
> > > > > > > +   ssize_t total = iov_size(iov, niov);
> > > > > > > +
> > > > > > > +   while (len < total) {
> > > > > > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > > > > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > > > > > +
> > > > > > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > > > > > +            if (qemu_in_coroutine()) {
> > > > > > > +                qio_channel_yield(ioc, G_IO_IN);
> > > > > > > +            } else {
> > > > > > > +                qio_channel_wait(ioc, G_IO_IN);
> > > > > > > +            }
> > > > > > > +            continue;
> > > > > > > +       }
> > > > > > > +       if (len == 0) {
> > > > > > > +           return 0;
> > > > > > > +       }
> > > > > > > +       if (len < 0) {
> > > > > > > +           return -1;
> > > > > > > +       }
> > > > > > > +   }
> > > > > > This will busy wait burning CPU where there is a read > 0 and < total.
> > > > > > 
> > > > > Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.
> > > > How easy would this happen?
> > > > 
> > > > Another alternative is we could just return the partial len to caller then
> > > > we fallback to the original channel orders if it happens.  And then if it
> > > > mostly will never happen it'll behave merely the same as what we want.
> > > Well we're trying to deal with a bug where the slow and/or unreliable
> > > network causes channels to arrive in unexpected order. Given we know
> > > we're having network trouble, I wouldn't want to make more assumptions
> > > about things happening correctly.
> > > 
> > > 
> > > With regards,
> > > Daniel
> > 
> > 
> > Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but looks like even though chances are less it can still return partial data even with multiple retries for signal case, so is not full proof.
> > 
> > *MSG_WAITALL *(since Linux 2.2)
> >               This flag requests that the operation block until the full
> >               request is satisfied.  However, the call may still return
> >               less data than requested if a signal is caught, an error
> >               or disconnect occurs, or the next data to be received is
> >               of a different type than that returned.  This flag has no
> >               effect for datagram sockets.
> > 
> > Actual read ahead will be little hackish, so just confirming we all are in agreement to do actual read ahead and i can send patch? :)
> 
> Yet another option is the caller handles partial PEEK and then we can sleep
> in the migration code before another PEEK attempt until it reaches the full
> length.
> 
> Even with that explicit sleep code IMHO it is cleaner than the read-header
> flag plus things like !tls check just to avoid the handshake dead lock
> itself (and if to go with this route we'd better also have a full document
> on why !tls, aka, how the dead lock can happen).

Nah, I forgot we're in the same condition as in the main thread.. sorry.

Then how about using qemu_co_sleep_ns_wakeable() to replace
qio_channel_yield() either above, or in the caller?
Peter Xu Nov. 22, 2022, 4:33 p.m. UTC | #13
On Tue, Nov 22, 2022 at 11:29:05AM -0500, Peter Xu wrote:
> On Tue, Nov 22, 2022 at 11:10:18AM -0500, Peter Xu wrote:
> > On Tue, Nov 22, 2022 at 09:01:59PM +0530, manish.mishra wrote:
> > > 
> > > On 22/11/22 8:19 pm, Daniel P. Berrangé wrote:
> > > > On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:
> > > > > On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> > > > > > On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > > > > > > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > > > > > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > > > > > > unread and the next read shall still return this data. This
> > > > > > > > support is currently added only for socket class. Extra parameter
> > > > > > > > 'flags' is added to io_readv calls to pass extra read flags like
> > > > > > > > MSG_PEEK.
> > > > > > > > 
> > > > > > > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > > > > > > Signed-off-by: manish.mishra<manish.mishra@nutanix.com>
> > > > > > > > ---
> > > > > > > >    chardev/char-socket.c               |  4 +-
> > > > > > > >    include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > > > > > > >    io/channel-buffer.c                 |  1 +
> > > > > > > >    io/channel-command.c                |  1 +
> > > > > > > >    io/channel-file.c                   |  1 +
> > > > > > > >    io/channel-null.c                   |  1 +
> > > > > > > >    io/channel-socket.c                 | 16 +++++-
> > > > > > > >    io/channel-tls.c                    |  1 +
> > > > > > > >    io/channel-websock.c                |  1 +
> > > > > > > >    io/channel.c                        | 73 +++++++++++++++++++++++--
> > > > > > > >    migration/channel-block.c           |  1 +
> > > > > > > >    scsi/qemu-pr-helper.c               |  2 +-
> > > > > > > >    tests/qtest/tpm-emu.c               |  2 +-
> > > > > > > >    tests/unit/test-io-channel-socket.c |  1 +
> > > > > > > >    util/vhost-user-server.c            |  2 +-
> > > > > > > >    15 files changed, 179 insertions(+), 11 deletions(-)
> > > > > > > 
> > > > > > > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > > > > > > index b76dca9cc1..a06b24766d 100644
> > > > > > > > --- a/io/channel-socket.c
> > > > > > > > +++ b/io/channel-socket.c
> > > > > > > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > > > > > > >        }
> > > > > > > >    #endif /* WIN32 */
> > > > > > > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > > > > > > +
> > > > > > > This covers the incoming server side socket.
> > > > > > > 
> > > > > > > This also needs to be set in outgoing client side socket in
> > > > > > > qio_channel_socket_connect_async
> > > > > > 
> > > > > > Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> > > > > > 
> > > > > > > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > > > > >    }
> > > > > > > >    #endif /* WIN32 */
> > > > > > > > -
> > > > > > > >    #ifdef QEMU_MSG_ZEROCOPY
> > > > > > > >    static int qio_channel_socket_flush(QIOChannel *ioc,
> > > > > > > >                                        Error **errp)
> > > > > > > Please remove this unrelated whitespace change.
> > > > > > > 
> > > > > > > 
> > > > > > > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > > > > > > >        return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > > > > > > >    }
> > > > > > > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > > > > > > +                                   const struct iovec *iov,
> > > > > > > > +                                   size_t niov,
> > > > > > > > +                                   Error **errp)
> > > > > > > > +{
> > > > > > > > +   ssize_t len = 0;
> > > > > > > > +   ssize_t total = iov_size(iov, niov);
> > > > > > > > +
> > > > > > > > +   while (len < total) {
> > > > > > > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > > > > > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > > > > > > +
> > > > > > > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > > > > > > +            if (qemu_in_coroutine()) {
> > > > > > > > +                qio_channel_yield(ioc, G_IO_IN);
> > > > > > > > +            } else {
> > > > > > > > +                qio_channel_wait(ioc, G_IO_IN);
> > > > > > > > +            }
> > > > > > > > +            continue;
> > > > > > > > +       }
> > > > > > > > +       if (len == 0) {
> > > > > > > > +           return 0;
> > > > > > > > +       }
> > > > > > > > +       if (len < 0) {
> > > > > > > > +           return -1;
> > > > > > > > +       }
> > > > > > > > +   }
> > > > > > > This will busy wait burning CPU where there is a read > 0 and < total.
> > > > > > > 
> > > > > > Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.
> > > > > How easy would this happen?
> > > > > 
> > > > > Another alternative is we could just return the partial len to caller then
> > > > > we fallback to the original channel orders if it happens.  And then if it
> > > > > mostly will never happen it'll behave merely the same as what we want.
> > > > Well we're trying to deal with a bug where the slow and/or unreliable
> > > > network causes channels to arrive in unexpected order. Given we know
> > > > we're having network trouble, I wouldn't want to make more assumptions
> > > > about things happening correctly.
> > > > 
> > > > 
> > > > With regards,
> > > > Daniel
> > > 
> > > 
> > > Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but looks like even though chances are less it can still return partial data even with multiple retries for signal case, so is not full proof.
> > > 
> > > *MSG_WAITALL *(since Linux 2.2)
> > >               This flag requests that the operation block until the full
> > >               request is satisfied.  However, the call may still return
> > >               less data than requested if a signal is caught, an error
> > >               or disconnect occurs, or the next data to be received is
> > >               of a different type than that returned.  This flag has no
> > >               effect for datagram sockets.
> > > 
> > > Actual read ahead will be little hackish, so just confirming we all are in agreement to do actual read ahead and i can send patch? :)
> > 
> > Yet another option is the caller handles partial PEEK and then we can sleep
> > in the migration code before another PEEK attempt until it reaches the full
> > length.
> > 
> > Even with that explicit sleep code IMHO it is cleaner than the read-header
> > flag plus things like !tls check just to avoid the handshake dead lock
> > itself (and if to go with this route we'd better also have a full document
> > on why !tls, aka, how the dead lock can happen).
> 
> Nah, I forgot we're in the same condition as in the main thread.. sorry.
> 
> Then how about using qemu_co_sleep_ns_wakeable() to replace
> qio_channel_yield() either above, or in the caller?

A better one is qemu_co_sleep_ns().  Off-topic: I'd even think we should
have one qemu_co_sleep_realtime_ns() because currently all callers of
qemu_co_sleep_ns() is for the rt clock.
manish.mishra Nov. 22, 2022, 4:42 p.m. UTC | #14
On 22/11/22 10:03 pm, Peter Xu wrote:
> On Tue, Nov 22, 2022 at 11:29:05AM -0500, Peter Xu wrote:
>> On Tue, Nov 22, 2022 at 11:10:18AM -0500, Peter Xu wrote:
>>> On Tue, Nov 22, 2022 at 09:01:59PM +0530, manish.mishra wrote:
>>>> On 22/11/22 8:19 pm, Daniel P. Berrangé wrote:
>>>>> On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:
>>>>>> On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
>>>>>>> On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
>>>>>>>> On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
>>>>>>>>> MSG_PEEK reads from the peek of channel, The data is treated as
>>>>>>>>> unread and the next read shall still return this data. This
>>>>>>>>> support is currently added only for socket class. Extra parameter
>>>>>>>>> 'flags' is added to io_readv calls to pass extra read flags like
>>>>>>>>> MSG_PEEK.
>>>>>>>>>
>>>>>>>>> Suggested-by: Daniel P. Berrangé <berrange@redhat.com
>>>>>>>>> Signed-off-by: manish.mishra<manish.mishra@nutanix.com>
>>>>>>>>> ---
>>>>>>>>>     chardev/char-socket.c               |  4 +-
>>>>>>>>>     include/io/channel.h                | 83 +++++++++++++++++++++++++++++
>>>>>>>>>     io/channel-buffer.c                 |  1 +
>>>>>>>>>     io/channel-command.c                |  1 +
>>>>>>>>>     io/channel-file.c                   |  1 +
>>>>>>>>>     io/channel-null.c                   |  1 +
>>>>>>>>>     io/channel-socket.c                 | 16 +++++-
>>>>>>>>>     io/channel-tls.c                    |  1 +
>>>>>>>>>     io/channel-websock.c                |  1 +
>>>>>>>>>     io/channel.c                        | 73 +++++++++++++++++++++++--
>>>>>>>>>     migration/channel-block.c           |  1 +
>>>>>>>>>     scsi/qemu-pr-helper.c               |  2 +-
>>>>>>>>>     tests/qtest/tpm-emu.c               |  2 +-
>>>>>>>>>     tests/unit/test-io-channel-socket.c |  1 +
>>>>>>>>>     util/vhost-user-server.c            |  2 +-
>>>>>>>>>     15 files changed, 179 insertions(+), 11 deletions(-)
>>>>>>>>> diff --git a/io/channel-socket.c b/io/channel-socket.c
>>>>>>>>> index b76dca9cc1..a06b24766d 100644
>>>>>>>>> --- a/io/channel-socket.c
>>>>>>>>> +++ b/io/channel-socket.c
>>>>>>>>> @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
>>>>>>>>>         }
>>>>>>>>>     #endif /* WIN32 */
>>>>>>>>> +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
>>>>>>>>> +
>>>>>>>> This covers the incoming server side socket.
>>>>>>>>
>>>>>>>> This also needs to be set in outgoing client side socket in
>>>>>>>> qio_channel_socket_connect_async
>>>>>>> Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
>>>>>>>
>>>>>>>>> @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
>>>>>>>>>     }
>>>>>>>>>     #endif /* WIN32 */
>>>>>>>>> -
>>>>>>>>>     #ifdef QEMU_MSG_ZEROCOPY
>>>>>>>>>     static int qio_channel_socket_flush(QIOChannel *ioc,
>>>>>>>>>                                         Error **errp)
>>>>>>>> Please remove this unrelated whitespace change.
>>>>>>>>
>>>>>>>>
>>>>>>>>> @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
>>>>>>>>>         return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
>>>>>>>>>     }
>>>>>>>>> +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
>>>>>>>>> +                                   const struct iovec *iov,
>>>>>>>>> +                                   size_t niov,
>>>>>>>>> +                                   Error **errp)
>>>>>>>>> +{
>>>>>>>>> +   ssize_t len = 0;
>>>>>>>>> +   ssize_t total = iov_size(iov, niov);
>>>>>>>>> +
>>>>>>>>> +   while (len < total) {
>>>>>>>>> +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
>>>>>>>>> +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
>>>>>>>>> +
>>>>>>>>> +       if (len == QIO_CHANNEL_ERR_BLOCK) {
>>>>>>>>> +            if (qemu_in_coroutine()) {
>>>>>>>>> +                qio_channel_yield(ioc, G_IO_IN);
>>>>>>>>> +            } else {
>>>>>>>>> +                qio_channel_wait(ioc, G_IO_IN);
>>>>>>>>> +            }
>>>>>>>>> +            continue;
>>>>>>>>> +       }
>>>>>>>>> +       if (len == 0) {
>>>>>>>>> +           return 0;
>>>>>>>>> +       }
>>>>>>>>> +       if (len < 0) {
>>>>>>>>> +           return -1;
>>>>>>>>> +       }
>>>>>>>>> +   }
>>>>>>>> This will busy wait burning CPU where there is a read > 0 and < total.
>>>>>>>>
>>>>>>> Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.
>>>>>> How easy would this happen?
>>>>>>
>>>>>> Another alternative is we could just return the partial len to caller then
>>>>>> we fallback to the original channel orders if it happens.  And then if it
>>>>>> mostly will never happen it'll behave merely the same as what we want.
>>>>> Well we're trying to deal with a bug where the slow and/or unreliable
>>>>> network causes channels to arrive in unexpected order. Given we know
>>>>> we're having network trouble, I wouldn't want to make more assumptions
>>>>> about things happening correctly.
>>>>>
>>>>>
>>>>> With regards,
>>>>> Daniel
>>>>
>>>> Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but looks like even though chances are less it can still return partial data even with multiple retries for signal case, so is not full proof.
>>>>
>>>> *MSG_WAITALL *(since Linux 2.2)
>>>>                This flag requests that the operation block until the full
>>>>                request is satisfied.  However, the call may still return
>>>>                less data than requested if a signal is caught, an error
>>>>                or disconnect occurs, or the next data to be received is
>>>>                of a different type than that returned.  This flag has no
>>>>                effect for datagram sockets.
>>>>
>>>> Actual read ahead will be little hackish, so just confirming we all are in agreement to do actual read ahead and i can send patch? :)
>>> Yet another option is the caller handles partial PEEK and then we can sleep
>>> in the migration code before another PEEK attempt until it reaches the full
>>> length.
>>>
>>> Even with that explicit sleep code IMHO it is cleaner than the read-header
>>> flag plus things like !tls check just to avoid the handshake dead lock
>>> itself (and if to go with this route we'd better also have a full document
>>> on why !tls, aka, how the dead lock can happen).
>> Nah, I forgot we're in the same condition as in the main thread.. sorry.
>>
>> Then how about using qemu_co_sleep_ns_wakeable() to replace
>> qio_channel_yield() either above, or in the caller?
> A better one is qemu_co_sleep_ns().  Off-topic: I'd even think we should
> have one qemu_co_sleep_realtime_ns() because currently all callers of
I am not aware of this :) , will check it.
> qemu_co_sleep_ns() is for the rt clock.


Yes that also works Peter. In that case, should i have a default time or take it from upper layers. And for live migration does something like of scale 1ms works?

Thanks

Manish Mishra

>
Peter Xu Nov. 22, 2022, 5:16 p.m. UTC | #15
On Tue, Nov 22, 2022 at 10:12:25PM +0530, manish.mishra wrote:
> 
> On 22/11/22 10:03 pm, Peter Xu wrote:
> > On Tue, Nov 22, 2022 at 11:29:05AM -0500, Peter Xu wrote:
> > > On Tue, Nov 22, 2022 at 11:10:18AM -0500, Peter Xu wrote:
> > > > On Tue, Nov 22, 2022 at 09:01:59PM +0530, manish.mishra wrote:
> > > > > On 22/11/22 8:19 pm, Daniel P. Berrangé wrote:
> > > > > > On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:
> > > > > > > On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> > > > > > > > On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > > > > > > > > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > > > > > > > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > > > > > > > > unread and the next read shall still return this data. This
> > > > > > > > > > support is currently added only for socket class. Extra parameter
> > > > > > > > > > 'flags' is added to io_readv calls to pass extra read flags like
> > > > > > > > > > MSG_PEEK.
> > > > > > > > > > 
> > > > > > > > > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > > > > > > > > Signed-off-by: manish.mishra<manish.mishra@nutanix.com>
> > > > > > > > > > ---
> > > > > > > > > >     chardev/char-socket.c               |  4 +-
> > > > > > > > > >     include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > > > > > > > > >     io/channel-buffer.c                 |  1 +
> > > > > > > > > >     io/channel-command.c                |  1 +
> > > > > > > > > >     io/channel-file.c                   |  1 +
> > > > > > > > > >     io/channel-null.c                   |  1 +
> > > > > > > > > >     io/channel-socket.c                 | 16 +++++-
> > > > > > > > > >     io/channel-tls.c                    |  1 +
> > > > > > > > > >     io/channel-websock.c                |  1 +
> > > > > > > > > >     io/channel.c                        | 73 +++++++++++++++++++++++--
> > > > > > > > > >     migration/channel-block.c           |  1 +
> > > > > > > > > >     scsi/qemu-pr-helper.c               |  2 +-
> > > > > > > > > >     tests/qtest/tpm-emu.c               |  2 +-
> > > > > > > > > >     tests/unit/test-io-channel-socket.c |  1 +
> > > > > > > > > >     util/vhost-user-server.c            |  2 +-
> > > > > > > > > >     15 files changed, 179 insertions(+), 11 deletions(-)
> > > > > > > > > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > > > > > > > > index b76dca9cc1..a06b24766d 100644
> > > > > > > > > > --- a/io/channel-socket.c
> > > > > > > > > > +++ b/io/channel-socket.c
> > > > > > > > > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > > > > > > > > >         }
> > > > > > > > > >     #endif /* WIN32 */
> > > > > > > > > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > > > > > > > > +
> > > > > > > > > This covers the incoming server side socket.
> > > > > > > > > 
> > > > > > > > > This also needs to be set in outgoing client side socket in
> > > > > > > > > qio_channel_socket_connect_async
> > > > > > > > Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> > > > > > > > 
> > > > > > > > > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > > > > > > >     }
> > > > > > > > > >     #endif /* WIN32 */
> > > > > > > > > > -
> > > > > > > > > >     #ifdef QEMU_MSG_ZEROCOPY
> > > > > > > > > >     static int qio_channel_socket_flush(QIOChannel *ioc,
> > > > > > > > > >                                         Error **errp)
> > > > > > > > > Please remove this unrelated whitespace change.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > > > > > > > > >         return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > > > > > > > > >     }
> > > > > > > > > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > > > > > > > > +                                   const struct iovec *iov,
> > > > > > > > > > +                                   size_t niov,
> > > > > > > > > > +                                   Error **errp)
> > > > > > > > > > +{
> > > > > > > > > > +   ssize_t len = 0;
> > > > > > > > > > +   ssize_t total = iov_size(iov, niov);
> > > > > > > > > > +
> > > > > > > > > > +   while (len < total) {
> > > > > > > > > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > > > > > > > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > > > > > > > > +
> > > > > > > > > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > > > > > > > > +            if (qemu_in_coroutine()) {
> > > > > > > > > > +                qio_channel_yield(ioc, G_IO_IN);
> > > > > > > > > > +            } else {
> > > > > > > > > > +                qio_channel_wait(ioc, G_IO_IN);
> > > > > > > > > > +            }
> > > > > > > > > > +            continue;
> > > > > > > > > > +       }
> > > > > > > > > > +       if (len == 0) {
> > > > > > > > > > +           return 0;
> > > > > > > > > > +       }
> > > > > > > > > > +       if (len < 0) {
> > > > > > > > > > +           return -1;
> > > > > > > > > > +       }
> > > > > > > > > > +   }
> > > > > > > > > This will busy wait burning CPU where there is a read > 0 and < total.
> > > > > > > > > 
> > > > > > > > Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.
> > > > > > > How easy would this happen?
> > > > > > > 
> > > > > > > Another alternative is we could just return the partial len to caller then
> > > > > > > we fallback to the original channel orders if it happens.  And then if it
> > > > > > > mostly will never happen it'll behave merely the same as what we want.
> > > > > > Well we're trying to deal with a bug where the slow and/or unreliable
> > > > > > network causes channels to arrive in unexpected order. Given we know
> > > > > > we're having network trouble, I wouldn't want to make more assumptions
> > > > > > about things happening correctly.
> > > > > > 
> > > > > > 
> > > > > > With regards,
> > > > > > Daniel
> > > > > 
> > > > > Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but looks like even though chances are less it can still return partial data even with multiple retries for signal case, so is not full proof.
> > > > > 
> > > > > *MSG_WAITALL *(since Linux 2.2)
> > > > >                This flag requests that the operation block until the full
> > > > >                request is satisfied.  However, the call may still return
> > > > >                less data than requested if a signal is caught, an error
> > > > >                or disconnect occurs, or the next data to be received is
> > > > >                of a different type than that returned.  This flag has no
> > > > >                effect for datagram sockets.
> > > > > 
> > > > > Actual read ahead will be little hackish, so just confirming we all are in agreement to do actual read ahead and i can send patch? :)
> > > > Yet another option is the caller handles partial PEEK and then we can sleep
> > > > in the migration code before another PEEK attempt until it reaches the full
> > > > length.
> > > > 
> > > > Even with that explicit sleep code IMHO it is cleaner than the read-header
> > > > flag plus things like !tls check just to avoid the handshake dead lock
> > > > itself (and if to go with this route we'd better also have a full document
> > > > on why !tls, aka, how the dead lock can happen).
> > > Nah, I forgot we're in the same condition as in the main thread.. sorry.
> > > 
> > > Then how about using qemu_co_sleep_ns_wakeable() to replace
> > > qio_channel_yield() either above, or in the caller?
> > A better one is qemu_co_sleep_ns().  Off-topic: I'd even think we should
> > have one qemu_co_sleep_realtime_ns() because currently all callers of
> I am not aware of this :) , will check it.
> > qemu_co_sleep_ns() is for the rt clock.
> 
> 
> Yes that also works Peter. In that case, should i have a default time or take it from upper layers. And for live migration does something like of scale 1ms works?

Sounds good to me on migration side.  When making it formal we'd also want
to know how Juan/Dave think.

But let's also wait for Dan's input about this before going forward.  If
the io code wants an _eof() version of PEEK then maybe we'd better do the
timeout-yield there even if not as elegant as G_IO_IN.  IIUC it's a matter
of whether we want to allow the PEEK interface return partial len.

Thanks,
Daniel P. Berrangé Nov. 22, 2022, 5:31 p.m. UTC | #16
On Tue, Nov 22, 2022 at 12:16:08PM -0500, Peter Xu wrote:
> On Tue, Nov 22, 2022 at 10:12:25PM +0530, manish.mishra wrote:
> > 
> > On 22/11/22 10:03 pm, Peter Xu wrote:
> > > On Tue, Nov 22, 2022 at 11:29:05AM -0500, Peter Xu wrote:
> > > > On Tue, Nov 22, 2022 at 11:10:18AM -0500, Peter Xu wrote:
> > > > > On Tue, Nov 22, 2022 at 09:01:59PM +0530, manish.mishra wrote:
> > > > > > On 22/11/22 8:19 pm, Daniel P. Berrangé wrote:
> > > > > > > On Tue, Nov 22, 2022 at 09:41:01AM -0500, Peter Xu wrote:
> > > > > > > > On Tue, Nov 22, 2022 at 02:38:53PM +0530, manish.mishra wrote:
> > > > > > > > > On 22/11/22 2:30 pm, Daniel P. Berrangé wrote:
> > > > > > > > > > On Sat, Nov 19, 2022 at 09:36:14AM +0000, manish.mishra wrote:
> > > > > > > > > > > MSG_PEEK reads from the peek of channel, The data is treated as
> > > > > > > > > > > unread and the next read shall still return this data. This
> > > > > > > > > > > support is currently added only for socket class. Extra parameter
> > > > > > > > > > > 'flags' is added to io_readv calls to pass extra read flags like
> > > > > > > > > > > MSG_PEEK.
> > > > > > > > > > > 
> > > > > > > > > > > Suggested-by: Daniel P. Berrangé <berrange@redhat.com
> > > > > > > > > > > Signed-off-by: manish.mishra<manish.mishra@nutanix.com>
> > > > > > > > > > > ---
> > > > > > > > > > >     chardev/char-socket.c               |  4 +-
> > > > > > > > > > >     include/io/channel.h                | 83 +++++++++++++++++++++++++++++
> > > > > > > > > > >     io/channel-buffer.c                 |  1 +
> > > > > > > > > > >     io/channel-command.c                |  1 +
> > > > > > > > > > >     io/channel-file.c                   |  1 +
> > > > > > > > > > >     io/channel-null.c                   |  1 +
> > > > > > > > > > >     io/channel-socket.c                 | 16 +++++-
> > > > > > > > > > >     io/channel-tls.c                    |  1 +
> > > > > > > > > > >     io/channel-websock.c                |  1 +
> > > > > > > > > > >     io/channel.c                        | 73 +++++++++++++++++++++++--
> > > > > > > > > > >     migration/channel-block.c           |  1 +
> > > > > > > > > > >     scsi/qemu-pr-helper.c               |  2 +-
> > > > > > > > > > >     tests/qtest/tpm-emu.c               |  2 +-
> > > > > > > > > > >     tests/unit/test-io-channel-socket.c |  1 +
> > > > > > > > > > >     util/vhost-user-server.c            |  2 +-
> > > > > > > > > > >     15 files changed, 179 insertions(+), 11 deletions(-)
> > > > > > > > > > > diff --git a/io/channel-socket.c b/io/channel-socket.c
> > > > > > > > > > > index b76dca9cc1..a06b24766d 100644
> > > > > > > > > > > --- a/io/channel-socket.c
> > > > > > > > > > > +++ b/io/channel-socket.c
> > > > > > > > > > > @@ -406,6 +406,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
> > > > > > > > > > >         }
> > > > > > > > > > >     #endif /* WIN32 */
> > > > > > > > > > > +    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
> > > > > > > > > > > +
> > > > > > > > > > This covers the incoming server side socket.
> > > > > > > > > > 
> > > > > > > > > > This also needs to be set in outgoing client side socket in
> > > > > > > > > > qio_channel_socket_connect_async
> > > > > > > > > Yes sorry, i considered only current use-case, but as it is generic one both should be there. Thanks will update it.
> > > > > > > > > 
> > > > > > > > > > > @@ -705,7 +718,6 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
> > > > > > > > > > >     }
> > > > > > > > > > >     #endif /* WIN32 */
> > > > > > > > > > > -
> > > > > > > > > > >     #ifdef QEMU_MSG_ZEROCOPY
> > > > > > > > > > >     static int qio_channel_socket_flush(QIOChannel *ioc,
> > > > > > > > > > >                                         Error **errp)
> > > > > > > > > > Please remove this unrelated whitespace change.
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > > @@ -109,6 +117,37 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
> > > > > > > > > > >         return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
> > > > > > > > > > >     }
> > > > > > > > > > > +int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
> > > > > > > > > > > +                                   const struct iovec *iov,
> > > > > > > > > > > +                                   size_t niov,
> > > > > > > > > > > +                                   Error **errp)
> > > > > > > > > > > +{
> > > > > > > > > > > +   ssize_t len = 0;
> > > > > > > > > > > +   ssize_t total = iov_size(iov, niov);
> > > > > > > > > > > +
> > > > > > > > > > > +   while (len < total) {
> > > > > > > > > > > +       len = qio_channel_readv_full(ioc, iov, niov, NULL,
> > > > > > > > > > > +                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
> > > > > > > > > > > +
> > > > > > > > > > > +       if (len == QIO_CHANNEL_ERR_BLOCK) {
> > > > > > > > > > > +            if (qemu_in_coroutine()) {
> > > > > > > > > > > +                qio_channel_yield(ioc, G_IO_IN);
> > > > > > > > > > > +            } else {
> > > > > > > > > > > +                qio_channel_wait(ioc, G_IO_IN);
> > > > > > > > > > > +            }
> > > > > > > > > > > +            continue;
> > > > > > > > > > > +       }
> > > > > > > > > > > +       if (len == 0) {
> > > > > > > > > > > +           return 0;
> > > > > > > > > > > +       }
> > > > > > > > > > > +       if (len < 0) {
> > > > > > > > > > > +           return -1;
> > > > > > > > > > > +       }
> > > > > > > > > > > +   }
> > > > > > > > > > This will busy wait burning CPU where there is a read > 0 and < total.
> > > > > > > > > > 
> > > > > > > > > Daniel, i could use MSG_WAITALL too if that works but then we will lose opportunity to yield. Or if you have some other idea.
> > > > > > > > How easy would this happen?
> > > > > > > > 
> > > > > > > > Another alternative is we could just return the partial len to caller then
> > > > > > > > we fallback to the original channel orders if it happens.  And then if it
> > > > > > > > mostly will never happen it'll behave merely the same as what we want.
> > > > > > > Well we're trying to deal with a bug where the slow and/or unreliable
> > > > > > > network causes channels to arrive in unexpected order. Given we know
> > > > > > > we're having network trouble, I wouldn't want to make more assumptions
> > > > > > > about things happening correctly.
> > > > > > > 
> > > > > > > 
> > > > > > > With regards,
> > > > > > > Daniel
> > > > > > 
> > > > > > Peter, I have seen MSG_PEEK used in combination with MSG_WAITALL, but looks like even though chances are less it can still return partial data even with multiple retries for signal case, so is not full proof.
> > > > > > 
> > > > > > *MSG_WAITALL *(since Linux 2.2)
> > > > > >                This flag requests that the operation block until the full
> > > > > >                request is satisfied.  However, the call may still return
> > > > > >                less data than requested if a signal is caught, an error
> > > > > >                or disconnect occurs, or the next data to be received is
> > > > > >                of a different type than that returned.  This flag has no
> > > > > >                effect for datagram sockets.
> > > > > > 
> > > > > > Actual read ahead will be little hackish, so just confirming we all are in agreement to do actual read ahead and i can send patch? :)
> > > > > Yet another option is the caller handles partial PEEK and then we can sleep
> > > > > in the migration code before another PEEK attempt until it reaches the full
> > > > > length.
> > > > > 
> > > > > Even with that explicit sleep code IMHO it is cleaner than the read-header
> > > > > flag plus things like !tls check just to avoid the handshake dead lock
> > > > > itself (and if to go with this route we'd better also have a full document
> > > > > on why !tls, aka, how the dead lock can happen).
> > > > Nah, I forgot we're in the same condition as in the main thread.. sorry.
> > > > 
> > > > Then how about using qemu_co_sleep_ns_wakeable() to replace
> > > > qio_channel_yield() either above, or in the caller?
> > > A better one is qemu_co_sleep_ns().  Off-topic: I'd even think we should
> > > have one qemu_co_sleep_realtime_ns() because currently all callers of
> > I am not aware of this :) , will check it.
> > > qemu_co_sleep_ns() is for the rt clock.
> > 
> > 
> > Yes that also works Peter. In that case, should i have a default time or take it from upper layers. And for live migration does something like of scale 1ms works?
> 
> Sounds good to me on migration side.  When making it formal we'd also want
> to know how Juan/Dave think.
> 
> But let's also wait for Dan's input about this before going forward.  If
> the io code wants an _eof() version of PEEK then maybe we'd better do the
> timeout-yield there even if not as elegant as G_IO_IN.  IIUC it's a matter
> of whether we want to allow the PEEK interface return partial len.

I don't think we should add an _eof() version with PEEK, because its
impossible to implement  sanely. If migration caller wants to busy
wait, or do a coroutine sleep it can do that.

With regards,
Daniel
diff mbox series

Patch

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 879564aa8a..5afce9a464 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -283,11 +283,11 @@  static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len)
     if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
         ret = qio_channel_readv_full(s->ioc, &iov, 1,
                                      &msgfds, &msgfds_num,
-                                     NULL);
+                                     0, NULL);
     } else {
         ret = qio_channel_readv_full(s->ioc, &iov, 1,
                                      NULL, NULL,
-                                     NULL);
+                                     0, NULL);
     }
 
     if (msgfds_num) {
diff --git a/include/io/channel.h b/include/io/channel.h
index c680ee7480..cbcde4b88f 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -34,6 +34,8 @@  OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
 
 #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
 
+#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
+
 typedef enum QIOChannelFeature QIOChannelFeature;
 
 enum QIOChannelFeature {
@@ -41,6 +43,7 @@  enum QIOChannelFeature {
     QIO_CHANNEL_FEATURE_SHUTDOWN,
     QIO_CHANNEL_FEATURE_LISTEN,
     QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
+    QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
 };
 
 
@@ -114,6 +117,7 @@  struct QIOChannelClass {
                         size_t niov,
                         int **fds,
                         size_t *nfds,
+                        int flags,
                         Error **errp);
     int (*io_close)(QIOChannel *ioc,
                     Error **errp);
@@ -188,6 +192,7 @@  void qio_channel_set_name(QIOChannel *ioc,
  * @niov: the length of the @iov array
  * @fds: pointer to an array that will received file handles
  * @nfds: pointer filled with number of elements in @fds on return
+ * @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
  * @errp: pointer to a NULL-initialized error object
  *
  * Read data from the IO channel, storing it in the
@@ -224,6 +229,7 @@  ssize_t qio_channel_readv_full(QIOChannel *ioc,
                                size_t niov,
                                int **fds,
                                size_t *nfds,
+                               int flags,
                                Error **errp);
 
 
@@ -300,6 +306,34 @@  int qio_channel_readv_all_eof(QIOChannel *ioc,
                               size_t niov,
                               Error **errp);
 
+/**
+ * qio_channel_readv_peek_all_eof:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data into
+ * @niov: the length of the @iov array
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read data from the peek of IO channel without
+ * actually removing it from channel buffer, storing
+ * it in the memory regions referenced by @iov. Each
+ * element in the @iov will be fully populated with
+ * data before the next one is used. The @niov
+ * parameter specifies the total number of elements
+ * in @iov.
+ *
+ * The function will wait for all requested data
+ * to be read, yielding from the current coroutine
+ * if required.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ *          occurs without data, or -1 on error
+ */
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
+                                   const struct iovec *iov,
+                                   size_t niov,
+                                   Error **errp);
+
+
 /**
  * qio_channel_readv_all:
  * @ioc: the channel object
@@ -328,6 +362,34 @@  int qio_channel_readv_all(QIOChannel *ioc,
                           Error **errp);
 
 
+/**
+ * qio_channel_readv_peek_all:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data into
+ * @niov: the length of the @iov array
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Read data from the the peek of IO channel without
+ * removing from channel buffer, storing it in the
+ * memory regions referenced by @iov. Each element
+ * in the @iov will be fully populated with data
+ * before the next one is used. The @niov parameter
+ * specifies the total number of elements in @iov.
+ *
+ * The function will wait for all requested data
+ * to be read, yielding from the current coroutine
+ * if required.
+ *
+ * If end-of-file occurs before all requested data
+ * has been read, an error will be reported.
+ *
+ * Returns: 0 if all bytes were read, or -1 on error
+ */
+int qio_channel_readv_peek_all(QIOChannel *ioc,
+                               const struct iovec *iov,
+                               size_t niov,
+                               Error **errp);
+
 /**
  * qio_channel_writev_all:
  * @ioc: the channel object
@@ -456,6 +518,27 @@  int qio_channel_read_all(QIOChannel *ioc,
                          size_t buflen,
                          Error **errp);
 
+/**
+ * qio_channel_read_peek_all:
+ * @ioc: the channel object
+ * @buf: the memory region to read data into
+ * @buflen: the number of bytes to @buf
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Reads @buflen bytes from the peek of channel into @buf without
+ * removing it from channel buffer, possibly blocking or (if the
+ * channel is non-blocking) yielding from the current coroutine
+ * multiple times until the entire content is read. If end-of-file
+ * occurs it will return an error rather than a short-read. Otherwise
+ * behaves as qio_channel_read().
+ *
+ * Returns: 0 if all bytes were read, or -1 on error
+ */
+int qio_channel_read_peek_all(QIOChannel *ioc,
+                              const char *buf,
+                              size_t buflen,
+                              Error **errp);
+
 /**
  * qio_channel_write_all:
  * @ioc: the channel object
diff --git a/io/channel-buffer.c b/io/channel-buffer.c
index bf52011be2..8096180f85 100644
--- a/io/channel-buffer.c
+++ b/io/channel-buffer.c
@@ -54,6 +54,7 @@  static ssize_t qio_channel_buffer_readv(QIOChannel *ioc,
                                         size_t niov,
                                         int **fds,
                                         size_t *nfds,
+                                        int flags,
                                         Error **errp)
 {
     QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);
diff --git a/io/channel-command.c b/io/channel-command.c
index 74516252ba..e7edd091af 100644
--- a/io/channel-command.c
+++ b/io/channel-command.c
@@ -203,6 +203,7 @@  static ssize_t qio_channel_command_readv(QIOChannel *ioc,
                                          size_t niov,
                                          int **fds,
                                          size_t *nfds,
+                                         int flags,
                                          Error **errp)
 {
     QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
diff --git a/io/channel-file.c b/io/channel-file.c
index b67687c2aa..d76663e6ae 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -86,6 +86,7 @@  static ssize_t qio_channel_file_readv(QIOChannel *ioc,
                                       size_t niov,
                                       int **fds,
                                       size_t *nfds,
+                                      int flags,
                                       Error **errp)
 {
     QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
diff --git a/io/channel-null.c b/io/channel-null.c
index 75e3781507..4fafdb770d 100644
--- a/io/channel-null.c
+++ b/io/channel-null.c
@@ -60,6 +60,7 @@  qio_channel_null_readv(QIOChannel *ioc,
                        size_t niov,
                        int **fds G_GNUC_UNUSED,
                        size_t *nfds G_GNUC_UNUSED,
+                       int flags,
                        Error **errp)
 {
     QIOChannelNull *nioc = QIO_CHANNEL_NULL(ioc);
diff --git a/io/channel-socket.c b/io/channel-socket.c
index b76dca9cc1..a06b24766d 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -406,6 +406,8 @@  qio_channel_socket_accept(QIOChannelSocket *ioc,
     }
 #endif /* WIN32 */
 
+    qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
+
     trace_qio_channel_socket_accept_complete(ioc, cioc, cioc->fd);
     return cioc;
 
@@ -496,6 +498,7 @@  static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
                                         size_t niov,
                                         int **fds,
                                         size_t *nfds,
+                                        int flags,
                                         Error **errp)
 {
     QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
@@ -517,6 +520,10 @@  static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
 
     }
 
+    if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
+        sflags |= MSG_PEEK;
+    }
+
  retry:
     ret = recvmsg(sioc->fd, &msg, sflags);
     if (ret < 0) {
@@ -624,11 +631,17 @@  static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
                                         size_t niov,
                                         int **fds,
                                         size_t *nfds,
+                                        int flags,
                                         Error **errp)
 {
     QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
     ssize_t done = 0;
     ssize_t i;
+    int sflags = 0;
+
+    if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
+        sflags |= MSG_PEEK;
+    }
 
     for (i = 0; i < niov; i++) {
         ssize_t ret;
@@ -636,7 +649,7 @@  static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
         ret = recv(sioc->fd,
                    iov[i].iov_base,
                    iov[i].iov_len,
-                   0);
+                   sflags);
         if (ret < 0) {
             if (errno == EAGAIN) {
                 if (done) {
@@ -705,7 +718,6 @@  static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
 }
 #endif /* WIN32 */
 
-
 #ifdef QEMU_MSG_ZEROCOPY
 static int qio_channel_socket_flush(QIOChannel *ioc,
                                     Error **errp)
diff --git a/io/channel-tls.c b/io/channel-tls.c
index 4ce890a538..c730cb8ec5 100644
--- a/io/channel-tls.c
+++ b/io/channel-tls.c
@@ -260,6 +260,7 @@  static ssize_t qio_channel_tls_readv(QIOChannel *ioc,
                                      size_t niov,
                                      int **fds,
                                      size_t *nfds,
+                                     int flags,
                                      Error **errp)
 {
     QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
diff --git a/io/channel-websock.c b/io/channel-websock.c
index fb4932ade7..a12acc27cf 100644
--- a/io/channel-websock.c
+++ b/io/channel-websock.c
@@ -1081,6 +1081,7 @@  static ssize_t qio_channel_websock_readv(QIOChannel *ioc,
                                          size_t niov,
                                          int **fds,
                                          size_t *nfds,
+                                         int flags,
                                          Error **errp)
 {
     QIOChannelWebsock *wioc = QIO_CHANNEL_WEBSOCK(ioc);
diff --git a/io/channel.c b/io/channel.c
index 0640941ac5..23c8752918 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -52,6 +52,7 @@  ssize_t qio_channel_readv_full(QIOChannel *ioc,
                                size_t niov,
                                int **fds,
                                size_t *nfds,
+                               int flags,
                                Error **errp)
 {
     QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
@@ -63,7 +64,14 @@  ssize_t qio_channel_readv_full(QIOChannel *ioc,
         return -1;
     }
 
-    return klass->io_readv(ioc, iov, niov, fds, nfds, errp);
+    if ((flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) &&
+        !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
+        error_setg_errno(errp, EINVAL,
+                         "Channel does not support peek read");
+        return -1;
+    }
+
+    return klass->io_readv(ioc, iov, niov, fds, nfds, flags, errp);
 }
 
 
@@ -109,6 +117,37 @@  int qio_channel_readv_all_eof(QIOChannel *ioc,
     return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
 }
 
+int qio_channel_readv_peek_all_eof(QIOChannel *ioc,
+                                   const struct iovec *iov,
+                                   size_t niov,
+                                   Error **errp)
+{
+   ssize_t len = 0;
+   ssize_t total = iov_size(iov, niov);
+
+   while (len < total) {
+       len = qio_channel_readv_full(ioc, iov, niov, NULL,
+                                    NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
+
+       if (len == QIO_CHANNEL_ERR_BLOCK) {
+            if (qemu_in_coroutine()) {
+                qio_channel_yield(ioc, G_IO_IN);
+            } else {
+                qio_channel_wait(ioc, G_IO_IN);
+            }
+            continue;
+       }
+       if (len == 0) {
+           return 0;
+       }
+       if (len < 0) {
+           return -1;
+       }
+   }
+
+   return 1;
+}
+
 int qio_channel_readv_all(QIOChannel *ioc,
                           const struct iovec *iov,
                           size_t niov,
@@ -117,6 +156,24 @@  int qio_channel_readv_all(QIOChannel *ioc,
     return qio_channel_readv_full_all(ioc, iov, niov, NULL, NULL, errp);
 }
 
+int qio_channel_readv_peek_all(QIOChannel *ioc,
+                               const struct iovec *iov,
+                               size_t niov,
+                               Error **errp)
+{
+    int ret = qio_channel_readv_peek_all_eof(ioc, iov, niov, errp);
+
+    if (ret == 0) {
+        error_setg(errp, "Unexpected end-of-file before all data were read");
+        return -1;
+    }
+    if (ret == 1) {
+        return 0;
+    }
+
+    return ret;
+}
+
 int qio_channel_readv_full_all_eof(QIOChannel *ioc,
                                    const struct iovec *iov,
                                    size_t niov,
@@ -146,7 +203,7 @@  int qio_channel_readv_full_all_eof(QIOChannel *ioc,
     while ((nlocal_iov > 0) || local_fds) {
         ssize_t len;
         len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds,
-                                     local_nfds, errp);
+                                     local_nfds, 0, errp);
         if (len == QIO_CHANNEL_ERR_BLOCK) {
             if (qemu_in_coroutine()) {
                 qio_channel_yield(ioc, G_IO_IN);
@@ -284,7 +341,7 @@  ssize_t qio_channel_readv(QIOChannel *ioc,
                           size_t niov,
                           Error **errp)
 {
-    return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, errp);
+    return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, 0, errp);
 }
 
 
@@ -303,7 +360,7 @@  ssize_t qio_channel_read(QIOChannel *ioc,
                          Error **errp)
 {
     struct iovec iov = { .iov_base = buf, .iov_len = buflen };
-    return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, errp);
+    return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, 0, errp);
 }
 
 
@@ -336,6 +393,14 @@  int qio_channel_read_all(QIOChannel *ioc,
     return qio_channel_readv_all(ioc, &iov, 1, errp);
 }
 
+int qio_channel_read_peek_all(QIOChannel *ioc,
+                              const char *buf,
+                              size_t buflen,
+                              Error **errp)
+{
+    struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
+    return qio_channel_readv_peek_all(ioc, &iov, 1, errp);
+}
 
 int qio_channel_write_all(QIOChannel *ioc,
                           const char *buf,
diff --git a/migration/channel-block.c b/migration/channel-block.c
index c55c8c93ce..0b0deeb919 100644
--- a/migration/channel-block.c
+++ b/migration/channel-block.c
@@ -53,6 +53,7 @@  qio_channel_block_readv(QIOChannel *ioc,
                         size_t niov,
                         int **fds,
                         size_t *nfds,
+                        int flags,
                         Error **errp)
 {
     QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c
index 196b78c00d..199227a556 100644
--- a/scsi/qemu-pr-helper.c
+++ b/scsi/qemu-pr-helper.c
@@ -614,7 +614,7 @@  static int coroutine_fn prh_read(PRHelperClient *client, void *buf, int sz,
         iov.iov_base = buf;
         iov.iov_len = sz;
         n_read = qio_channel_readv_full(QIO_CHANNEL(client->ioc), &iov, 1,
-                                        &fds, &nfds, errp);
+                                        &fds, &nfds, 0, errp);
 
         if (n_read == QIO_CHANNEL_ERR_BLOCK) {
             qio_channel_yield(QIO_CHANNEL(client->ioc), G_IO_IN);
diff --git a/tests/qtest/tpm-emu.c b/tests/qtest/tpm-emu.c
index 2994d1cf42..3cf1acaf7d 100644
--- a/tests/qtest/tpm-emu.c
+++ b/tests/qtest/tpm-emu.c
@@ -106,7 +106,7 @@  void *tpm_emu_ctrl_thread(void *data)
         int *pfd = NULL;
         size_t nfd = 0;
 
-        qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, &error_abort);
+        qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, 0, &error_abort);
         cmd = be32_to_cpu(cmd);
         g_assert_cmpint(cmd, ==, CMD_SET_DATAFD);
         g_assert_cmpint(nfd, ==, 1);
diff --git a/tests/unit/test-io-channel-socket.c b/tests/unit/test-io-channel-socket.c
index b36a5d972a..b964bb202d 100644
--- a/tests/unit/test-io-channel-socket.c
+++ b/tests/unit/test-io-channel-socket.c
@@ -460,6 +460,7 @@  static void test_io_channel_unix_fd_pass(void)
                            G_N_ELEMENTS(iorecv),
                            &fdrecv,
                            &nfdrecv,
+                           0,
                            &error_abort);
 
     g_assert(nfdrecv == G_N_ELEMENTS(fdsend));
diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
index 232984ace6..145eb17c08 100644
--- a/util/vhost-user-server.c
+++ b/util/vhost-user-server.c
@@ -116,7 +116,7 @@  vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
          * qio_channel_readv_full may have short reads, keeping calling it
          * until getting VHOST_USER_HDR_SIZE or 0 bytes in total
          */
-        rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err);
+        rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, 0, &local_err);
         if (rc < 0) {
             if (rc == QIO_CHANNEL_ERR_BLOCK) {
                 assert(local_err == NULL);