
[PULL,v2,31/82] vhost: Change the sequence of device start

Message ID 20221102160336.616599-32-mst@redhat.com (mailing list archive)
State New, archived
Series [PULL,v2,01/82] hw/i386/e820: remove legacy reserved entries for e820

Commit Message

Michael S. Tsirkin Nov. 2, 2022, 4:09 p.m. UTC
From: Yajun Wu <yajunw@nvidia.com>

This patch is part of adding vhost-user vhost_dev_start support. The
motivation is to improve backend configuration speed and reduce live
migration VM downtime.

Move the device start routines to after all the necessary device and VQ
configuration has finished, further aligning with the virtio specification's
"device initialization sequence".

The following patch will add vhost-user vhost_dev_start support.

Signed-off-by: Yajun Wu <yajunw@nvidia.com>
Acked-by: Parav Pandit <parav@nvidia.com>

Message-Id: <20221017064452.1226514-2-yajunw@nvidia.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/block/vhost-user-blk.c | 18 +++++++++++-------
 hw/net/vhost_net.c        | 12 ++++++------
 2 files changed, 17 insertions(+), 13 deletions(-)

Comments

Bernhard Beschow Nov. 5, 2022, 4:35 p.m. UTC | #1
On Wed, Nov 2, 2022 at 5:24 PM Michael S. Tsirkin <mst@redhat.com> wrote:

> [...]

A git bisect tells me that this is the first bad commit for the failing
qos-tests, which only fail when parallel jobs are enabled, e.g. `make
check-qtest -j8`:

Summary of Failures:

 76/541 qemu:qtest+qtest-aarch64 / qtest-aarch64/qos-test
      ERROR          18.68s   killed by signal 6 SIGABRT
 77/541 qemu:qtest+qtest-arm / qtest-arm/qos-test
      ERROR          17.60s   killed by signal 6 SIGABRT
 93/541 qemu:qtest+qtest-i386 / qtest-i386/qos-test
      ERROR          18.98s   killed by signal 6 SIGABRT
108/541 qemu:qtest+qtest-ppc64 / qtest-ppc64/qos-test
      ERROR          16.40s   killed by signal 6 SIGABRT
112/541 qemu:qtest+qtest-i386 / qtest-i386/bios-tables-test
      ERROR          145.94s   killed by signal 6 SIGABRT
130/541 qemu:qtest+qtest-x86_64 / qtest-x86_64/qos-test
      ERROR          17.32s   killed by signal 6 SIGABRT
243/541 qemu:qtest+qtest-x86_64 / qtest-x86_64/bios-tables-test
      ERROR          127.70s   killed by signal 6 SIGABRT

Ok:                 500
Expected Fail:      0
Fail:               7
Unexpected Pass:    0
Skipped:            34
Timeout:            0

Can anyone else reproduce this?

Here is a log of one failed case:

# child process
(/aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/virtio-net-pci/virtio-net/virtio-net-tests/vhost-user/flags-mismatch/subprocess
[384604]) killed by signal 6 (Aborted), core dumped
# child process
(/aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/virtio-net-pci/virtio-net/virtio-net-tests/vhost-user/flags-mismatch/subprocess
[384604]) stdout: ""
# child process
(/aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/virtio-net-pci/virtio-net/virtio-net-tests/vhost-user/flags-mismatch/subprocess
[384604]) stderr: "qemu-system-aarch64: -chardev
socket,id=chr-flags-mismatch,path=/tmp/vhost-test-BBEAV1/flags-mismatch.sock,server=on:
info: QEMU waiting for connection on:
disconnected:unix:/tmp/vhost-test-BBEAV1/flags-mismatch.sock,server=on\nqemu-system-aarch64:
Failed to write msg. Wrote -1 instead of 52.\nqemu-system-aarch64:
vhost_set_mem_table failed: Invalid argument (22)\nqemu-system-aarch64:
Failed to set msg fds.\nqemu-system-aarch64: vhost VQ 0 ring restore
failed: -22: Invalid argument (22)\nBroken
pipe\n../src/tests/qtest/libqtest.c:188: kill_qemu() detected QEMU death
from signal 11 (Segmentation fault) (core dumped)\n"
Bail out! ERROR:../src/tests/qtest/qos-test.c:191:subprocess_run_one_test:
child process
(/aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/virtio-net-pci/virtio-net/virtio-net-tests/vhost-user/flags-mismatch/subprocess
[384604]) failed unexpectedly
----------------------------------- stderr -----------------------------------
qemu-system-aarch64: Failed to write msg. Wrote -1 instead of 20.
qemu-system-aarch64: vhost VQ 0 ring restore failed: -22: Invalid argument
(22)
qemu-system-aarch64: Failed to set msg fds.
qemu-system-aarch64: vhost VQ 1 ring restore failed: -22: Invalid argument
(22)
qemu-system-aarch64: -chardev
socket,id=chr-reconnect,path=/tmp/vhost-test-FTJ0U1/reconnect.sock,server=on:
info: QEMU waiting for connection on:
disconnected:unix:/tmp/vhost-test-FTJ0U1/reconnect.sock,server=on
qemu-system-aarch64: Failed to write msg. Wrote -1 instead of 20.
qemu-system-aarch64: vhost VQ 0 ring restore failed: -22: Invalid argument
(22)
qemu-system-aarch64: Failed to set msg fds.
qemu-system-aarch64: vhost VQ 1 ring restore failed: -22: Invalid argument
(22)
qemu-system-aarch64: -chardev
socket,id=chr-connect-fail,path=/tmp/vhost-test-4W4DV1/connect-fail.sock,server=on:
info: QEMU waiting for connection on:
disconnected:unix:/tmp/vhost-test-4W4DV1/connect-fail.sock,server=on
qemu-system-aarch64: -netdev
vhost-user,id=hs0,chardev=chr-connect-fail,vhostforce=on: Failed to read
msg header. Read 0 instead of 12. Original request 1.
qemu-system-aarch64: -netdev
vhost-user,id=hs0,chardev=chr-connect-fail,vhostforce=on:
vhost_backend_init failed: Protocol error
qemu-system-aarch64: -netdev
vhost-user,id=hs0,chardev=chr-connect-fail,vhostforce=on: failed to init
vhost_net for queue 0
qemu-system-aarch64: -netdev
vhost-user,id=hs0,chardev=chr-connect-fail,vhostforce=on: info: QEMU
waiting for connection on:
disconnected:unix:/tmp/vhost-test-4W4DV1/connect-fail.sock,server=on
qemu-system-aarch64: Failed to set msg fds.
qemu-system-aarch64: vhost VQ 0 ring restore failed: -22: Invalid argument
(22)
qemu-system-aarch64: Failed to set msg fds.
qemu-system-aarch64: vhost VQ 1 ring restore failed: -22: Invalid argument
(22)
qemu-system-aarch64: -chardev
socket,id=chr-flags-mismatch,path=/tmp/vhost-test-BBEAV1/flags-mismatch.sock,server=on:
info: QEMU waiting for connection on:
disconnected:unix:/tmp/vhost-test-BBEAV1/flags-mismatch.sock,server=on
qemu-system-aarch64: Failed to write msg. Wrote -1 instead of 52.
qemu-system-aarch64: vhost_set_mem_table failed: Invalid argument (22)
qemu-system-aarch64: Failed to set msg fds.
qemu-system-aarch64: vhost VQ 0 ring restore failed: -22: Invalid argument
(22)
Broken pipe
../src/tests/qtest/libqtest.c:188: kill_qemu() detected QEMU death from
signal 11 (Segmentation fault) (core dumped)
**
ERROR:../src/tests/qtest/qos-test.c:191:subprocess_run_one_test: child
process
(/aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/virtio-net-pci/virtio-net/virtio-net-tests/vhost-user/flags-mismatch/subprocess
[384604]) failed unexpectedly

(test program exited with status code -6)
==============================================================================

=================================== 77/541 ===================================
test:         qemu:qtest+qtest-arm / qtest-arm/qos-test
start time:   16:04:10
duration:     17.60s
result:       killed by signal 6 SIGABRT
command:      MALLOC_PERTURB_=98 QTEST_QEMU_IMG=./qemu-img
QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon
QTEST_QEMU_BINARY=./qemu-system-arm
G_TEST_DBUS_DAEMON=../src/tests/dbus-vmstate-daemon.sh
./tests/qtest/qos-test --tap -k

Best regards,
Bernhard
Michael S. Tsirkin Nov. 5, 2022, 4:43 p.m. UTC | #2
On Sat, Nov 05, 2022 at 05:35:57PM +0100, Bernhard Beschow wrote:
> [...]
> A git bisect tells me that this is the first bad commit for the failing qos-tests,
> which only fail when parallel jobs are enabled, e.g. `make check-qtest -j8`:
> [...]
> Can anyone else reproduce this?

Could you pls try latest for_upstream in my tree?
That should have this fixed.

Thanks!


Christian A. Ehrhardt Nov. 6, 2022, 6 p.m. UTC | #3
Hi,

On Sat, Nov 05, 2022 at 12:43:05PM -0400, Michael S. Tsirkin wrote:
> On Sat, Nov 05, 2022 at 05:35:57PM +0100, Bernhard Beschow wrote:
> > [...]
> > A git bisect tells me that this is the first bad commit for the failing qos-tests,
> > which only fail when parallel jobs are enabled, e.g. `make check-qtest -j8`:

A parallel test run is not required, provided that the test machine is
sufficiently busy (load > number of CPU threads). In this case a single
invocation of the qos test will fail reliably with this change.

However, the change is not really the root cause of the failures.

> > [...]
> 
> Could you pls try latest for_upstream in my tree?
> That should have this fixed.

Your new pull request simply drops this change and this does fix
make check-qtest. However, this looks accidental to me and the real
bug is there in plain origin/master, too.

What happens is a recursive call to vu_gpio_stop, shown in the backtrace
below. It is caused by a delayed disconnect of the TCP connection (the
delayed part only triggers with heavy load on the machine).

You can get the failure back (probably in upstream) if the test is
forced to use "use-started=off", which can be set on the command line,
e.g. like this:

diff --git a/tests/qtest/libqos/virtio-gpio.c b/tests/qtest/libqos/virtio-gpio.c
index 762aa6695b..17c6b71e8b 100644
--- a/tests/qtest/libqos/virtio-gpio.c
+++ b/tests/qtest/libqos/virtio-gpio.c
@@ -154,14 +154,14 @@ static void virtio_gpio_register_nodes(void)
     QOSGraphEdgeOptions edge_opts = { };

     /* vhost-user-gpio-device */
-    edge_opts.extra_device_opts = "id=gpio0,chardev=chr-vhost-user-test";
+    edge_opts.extra_device_opts = "id=gpio0,chardev=chr-vhost-user-test,use-started=off";
     qos_node_create_driver("vhost-user-gpio-device",
                             virtio_gpio_device_create);
     qos_node_consumes("vhost-user-gpio-device", "virtio-bus", &edge_opts);
     qos_node_produces("vhost-user-gpio-device", "vhost-user-gpio");

     /* virtio-gpio-pci */
-    edge_opts.extra_device_opts = "id=gpio0,addr=04.0,chardev=chr-vhost-user-test";
+    edge_opts.extra_device_opts = "id=gpio0,addr=04.0,chardev=chr-vhost-user-test,use-started=on";
     add_qpci_address(&edge_opts, &addr);
     qos_node_create_driver("vhost-user-gpio-pci", virtio_gpio_pci_create);
     qos_node_consumes("vhost-user-gpio-pci", "pci-bus", &edge_opts);


I haven't verified this but from looking at the code other types of
vhost devices seem to have the same problem (e.g. vhost-user-i2c looks
suspicious).

Ok, here's the backtrace:

#0  vu_gpio_stop (vdev=vdev@entry=0x560e0ae449d0) at ../hw/virtio/vhost-user-gpio.c:143
#1  0x0000560e0768fb1f in vu_gpio_disconnect (dev=<optimized out>) at ../hw/virtio/vhost-user-gpio.c:260
#2  vu_gpio_event (opaque=<optimized out>, event=<optimized out>) at ../hw/virtio/vhost-user-gpio.c:279
#3  0x0000560e078057b5 in tcp_chr_disconnect_locked (chr=0x560e0a00f800) at ../chardev/char-socket.c:470
#4  0x0000560e078058d3 in tcp_chr_write (chr=0x560e0a00f800, buf=<optimized out>, len=<optimized out>) at ../chardev/char-socket.c:129
#5  0x0000560e07808a63 in qemu_chr_write_buffer (s=s@entry=0x560e0a00f800, buf=buf@entry=0x7ffc76812ac0 <error: Cannot access memory at address 0x7ffc76812ac0>, len=20, offset=offset@entry=0x7ffc76812a40, write_all=write_all@entry=true) at ../chardev/char.c:121
#6  0x0000560e07808e84 in qemu_chr_write (s=0x560e0a00f800, buf=buf@entry=0x7ffc76812ac0 <error: Cannot access memory at address 0x7ffc76812ac0>, len=len@entry=20, write_all=write_all@entry=true) at ../chardev/char.c:173
#7  0x0000560e078010e6 in qemu_chr_fe_write_all (be=be@entry=0x560e0ae44bc0, buf=buf@entry=0x7ffc76812ac0 <error: Cannot access memory at address 0x7ffc76812ac0>, len=len@entry=20) at ../chardev/char-fe.c:53
#8  0x0000560e07676d16 in vhost_user_write (msg=msg@entry=0x7ffc76812ac0, fds=fds@entry=0x0, fd_num=fd_num@entry=0, dev=<optimized out>, dev=<optimized out>) at ../hw/virtio/vhost-user.c:490
#9  0x0000560e076777c0 in vhost_user_get_vring_base (dev=0x560e0ae44c08, ring=0x7ffc76812d80) at ../hw/virtio/vhost-user.c:1260
#10 0x0000560e0767314e in vhost_virtqueue_stop (dev=dev@entry=0x560e0ae44c08, vdev=vdev@entry=0x560e0ae449d0, vq=0x560e0ae7a570, idx=0) at ../hw/virtio/vhost.c:1220
#11 0x0000560e07675236 in vhost_dev_stop (hdev=hdev@entry=0x560e0ae44c08, vdev=vdev@entry=0x560e0ae449d0) at ../hw/virtio/vhost.c:1884
#12 0x0000560e0768f2fd in vu_gpio_stop (vdev=0x560e0ae449d0) at ../hw/virtio/vhost-user-gpio.c:148
#13 0x0000560e0766a6a3 in virtio_set_status (vdev=0x560e0ae449d0, val=<optimized out>) at ../hw/virtio/virtio.c:2442
#14 0x0000560e0751a2e0 in vm_state_notify (running=running@entry=false, state=state@entry=RUN_STATE_SHUTDOWN) at ../softmmu/runstate.c:334
#15 0x0000560e07514d70 in do_vm_stop (send_stop=false, state=RUN_STATE_SHUTDOWN) at ../softmmu/cpus.c:262
#16 vm_shutdown () at ../softmmu/cpus.c:280
#17 0x0000560e0751af63 in qemu_cleanup () at ../softmmu/runstate.c:827
#18 0x0000560e07350f33 in qemu_default_main () at ../softmmu/main.c:38
#19 0x00007f0a5655bd90 in __libc_start_call_main (main=main@entry=0x560e0734c3e0 <main>, argc=argc@entry=25, argv=argv@entry=0x7ffc768130a8) at ../sysdeps/nptl/libc_start_call_main.h:58
#20 0x00007f0a5655be40 in __libc_start_main_impl (main=0x560e0734c3e0 <main>, argc=25, argv=0x7ffc768130a8, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7ffc76813098) at ../csu/libc-start.c:392
#21 0x0000560e07350e55 in _start ()


Any Ideas how to fix this properly?

    regards    Christian
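
The failure mode above can be read as a missing re-entrancy guard in the stop
path: vhost_dev_stop() writes GET_VRING_BASE to the vhost-user socket, the
failed write makes the chardev deliver CHR_EVENT_CLOSED synchronously, and the
disconnect handler calls vu_gpio_stop() again. A minimal sketch of such a
guard, assuming a hypothetical "stopping" flag (this is only an illustration
of the idea being discussed, not the upstream fix and not the exact layout of
hw/virtio/vhost-user-gpio.c):

static void vu_gpio_stop(VirtIODevice *vdev)
{
    VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
    struct vhost_dev *vhost_dev = &gpio->vhost_dev;

    /*
     * vhost_dev_stop() below writes to the vhost-user socket; if that
     * write fails, the chardev fires CHR_EVENT_CLOSED synchronously and
     * vu_gpio_disconnect() re-enters vu_gpio_stop().  The (hypothetical)
     * 'stopping' flag breaks that recursion.
     */
    if (gpio->stopping) {
        return;
    }
    gpio->stopping = true;

    vhost_dev_stop(vhost_dev, vdev);
    vhost_dev_disable_notifiers(vhost_dev, vdev);

    gpio->stopping = false;
}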
Michael S. Tsirkin Nov. 7, 2022, 1:30 p.m. UTC | #4
On Sun, Nov 06, 2022 at 07:00:33PM +0100, Christian A. Ehrhardt wrote:
> [...]
> 
> Any Ideas how to fix this properly?
> 
>     regards    Christian

so fundamentally, any write into a socket can trigger stop?
I don't think we realized this.
Hmm, this needs some thought; there are likely more issues
like this.
Christian A. Ehrhardt Nov. 7, 2022, 5:31 p.m. UTC | #5
On Mon, Nov 07, 2022 at 08:30:32AM -0500, Michael S. Tsirkin wrote:
> On Sun, Nov 06, 2022 at 07:00:33PM +0100, Christian A. Ehrhardt wrote:
> > [...]
> 
> so fundamentally, any write into a socket can trigger stop?
> I don't think we realized this.
> Hmm, this needs some thought; there are likely more issues
> like this.

The code in net/vhost-user.c seems to realize this and uses
aio to schedule the close. I was wondering if chardev should
do this for OPEN/CLOSE events by default (or by request).
However, I couldn't figure out how to ensure that the device
remains alive until the callback is actually called.

      regards   Christian
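
The deferred-close pattern referred to here looks roughly like the sketch
below, loosely modelled on net/vhost-user.c (the state type and the stop
helper are hypothetical stand-ins, not the actual upstream code):

/* Bottom half: runs later from the main loop, outside the chardev write
 * path that detected the disconnect, so stopping the device here cannot
 * recurse into the caller. */
static void chr_closed_bh(void *opaque)
{
    VhostUserDeviceState *s = opaque;   /* hypothetical state type */

    vhost_user_device_stop(s);          /* hypothetical stop helper */
}

static void chr_event(void *opaque, QEMUChrEvent event)
{
    if (event == CHR_EVENT_CLOSED) {
        /*
         * Do not tear the device down synchronously from inside a failed
         * write; schedule a one-shot bottom half on the current
         * AioContext instead.
         */
        aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
                                chr_closed_bh, opaque);
    }
}

As noted above, the open question with this pattern is how to guarantee that
the state passed as 'opaque' stays alive until the bottom half actually runs.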
Michael S. Tsirkin Nov. 7, 2022, 5:54 p.m. UTC | #6
On Mon, Nov 07, 2022 at 06:31:00PM +0100, Christian A. Ehrhardt wrote:
> [...]
> The code in net/vhost-user.c seems to realize this and uses
> aio to schedule the close. I was wondering if chardev should
> do this for OPEN/CLOSE events by default (or by request).
> However, I couldn't figure out how to ensure that the device
> remains alive until the callback is actually called.
> 
>       regards   Christian

Cc Alex who's been looking at this.

Patch

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 13bf5cc47a..28409c90f7 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -168,13 +168,6 @@  static int vhost_user_blk_start(VirtIODevice *vdev, Error **errp)
         goto err_guest_notifiers;
     }
 
-    ret = vhost_dev_start(&s->dev, vdev);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "Error starting vhost");
-        goto err_guest_notifiers;
-    }
-    s->started_vu = true;
-
     /* guest_notifier_mask/pending not used yet, so just unmask
      * everything here. virtio-pci will do the right thing by
      * enabling/disabling irqfd.
@@ -183,9 +176,20 @@  static int vhost_user_blk_start(VirtIODevice *vdev, Error **errp)
         vhost_virtqueue_mask(&s->dev, vdev, i, false);
     }
 
+    s->dev.vq_index_end = s->dev.nvqs;
+    ret = vhost_dev_start(&s->dev, vdev);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Error starting vhost");
+        goto err_guest_notifiers;
+    }
+    s->started_vu = true;
+
     return ret;
 
 err_guest_notifiers:
+    for (i = 0; i < s->dev.nvqs; i++) {
+        vhost_virtqueue_mask(&s->dev, vdev, i, true);
+    }
     k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
 err_host_notifiers:
     vhost_dev_disable_notifiers(&s->dev, vdev);
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index d28f8b974b..d6924f5e57 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -387,21 +387,21 @@  int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
         } else {
             peer = qemu_get_peer(ncs, n->max_queue_pairs);
         }
-        r = vhost_net_start_one(get_vhost_net(peer), dev);
-
-        if (r < 0) {
-            goto err_start;
-        }
 
         if (peer->vring_enable) {
             /* restore vring enable state */
             r = vhost_set_vring_enable(peer, peer->vring_enable);
 
             if (r < 0) {
-                vhost_net_stop_one(get_vhost_net(peer), dev);
                 goto err_start;
             }
         }
+
+        r = vhost_net_start_one(get_vhost_net(peer), dev);
+        if (r < 0) {
+            vhost_net_stop_one(get_vhost_net(peer), dev);
+            goto err_start;
+        }
     }
 
     return 0;