block/file-posix: add bdrv_attach_aio_context callback for host dev and cdrom

Message ID 20180718211256.29774-1-naravamudan@digitalocean.com (mailing list archive)
State New, archived

Commit Message

Nishanth Aravamudan July 18, 2018, 9:12 p.m. UTC
In ed6e2161 ("linux-aio: properly bubble up errors from initialization"),
I only added a bdrv_attach_aio_context callback for the bdrv_file
driver. There are several other drivers that use the shared
aio_plug callback, though, and they will trip the assertion added to
aio_get_linux_aio because they did not call aio_setup_linux_aio first.
Add the appropriate callback definition to the affected driver
definitions.

Fixes: ed6e2161 ("linux-aio: properly bubble up errors from initialization")
Reported-by: Farhan Ali <alifm@linux.ibm.com>
Signed-off-by: Nishanth Aravamudan <naravamudan@digitalocean.com>
Cc: Eric Blake <eblake@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: John Snow <jsnow@redhat.com>
Cc: Max Reitz <mreitz@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Fam Zheng <famz@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: qemu-block@nongnu.org
Cc: qemu-devel@nongnu.org
---
 block/file-posix.c | 3 +++
 1 file changed, 3 insertions(+)
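
For anyone not familiar with the callback ordering involved, below is a
small, self-contained C sketch of the pattern this patch restores. It is a
toy model, not QEMU code: setup_linux_aio(), get_linux_aio(), attach_cb()
and io_plug() are stand-in names for aio_setup_linux_aio(),
aio_get_linux_aio(), raw_aio_attach_aio_context() and the shared plug path;
the only point is that a driver whose table lacks the attach callback never
runs the setup step and therefore trips the assertion on first use.

#include <assert.h>
#include <stdio.h>

/* Toy stand-ins for AioContext and the BlockDriver callback table. */
typedef struct { void *linux_aio; } Ctx;
typedef struct {
    const char *name;
    void (*attach_aio_context)(Ctx *ctx);  /* may be NULL, as before this patch */
} Driver;

static int dummy_state;

static void setup_linux_aio(Ctx *ctx)      /* models aio_setup_linux_aio() */
{
    ctx->linux_aio = &dummy_state;
}

static void *get_linux_aio(Ctx *ctx)       /* models aio_get_linux_aio() after ed6e2161 */
{
    assert(ctx->linux_aio);                /* fires if setup never ran for this context */
    return ctx->linux_aio;
}

static void attach_cb(Ctx *ctx)            /* models raw_aio_attach_aio_context() */
{
    setup_linux_aio(ctx);
}

static void io_plug(Driver *drv, Ctx *ctx) /* models the shared raw_aio_plug() path */
{
    printf("%s: using linux-aio state %p\n", drv->name, get_linux_aio(ctx));
}

int main(void)
{
    Ctx ctx = { NULL };
    Driver host_device = { "host_device", attach_cb };  /* with this patch applied */
    /* Driver host_device = { "host_device", NULL }; */ /* pre-patch table: io_plug() asserts */

    if (host_device.attach_aio_context) {
        host_device.attach_aio_context(&ctx);           /* bdrv_attach_aio_context */
    }
    io_plug(&host_device, &ctx);
    return 0;
}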

Comments

John Snow July 19, 2018, 8:24 p.m. UTC | #1
On 07/18/2018 05:12 PM, Nishanth Aravamudan via Qemu-devel wrote:
> In ed6e2161 ("linux-aio: properly bubble up errors from initialization"),
> I only added a bdrv_attach_aio_context callback for the bdrv_file
> driver. There are several other drivers that use the shared
> aio_plug callback, though, and they will trip the assertion added to
> aio_get_linux_aio because they did not call aio_setup_linux_aio first.
> Add the appropriate callback definition to the affected driver
> definitions.
> 
> Fixes: ed6e2161 ("linux-aio: properly bubble up errors from initialization")
> Reported-by: Farhan Ali <alifm@linux.ibm.com>
> Signed-off-by: Nishanth Aravamudan <naravamudan@digitalocean.com>
> Cc: Eric Blake <eblake@redhat.com>
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: John Snow <jsnow@redhat.com>
> Cc: Max Reitz <mreitz@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Fam Zheng <famz@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: qemu-block@nongnu.org
> Cc: qemu-devel@nongnu.org
> ---
>  block/file-posix.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 60af4b3d51..ad299beb38 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -3158,6 +3158,7 @@ static BlockDriver bdrv_host_device = {
>      .bdrv_refresh_limits = raw_refresh_limits,
>      .bdrv_io_plug = raw_aio_plug,
>      .bdrv_io_unplug = raw_aio_unplug,
> +    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>  
>      .bdrv_co_truncate       = raw_co_truncate,
>      .bdrv_getlength	= raw_getlength,
> @@ -3280,6 +3281,7 @@ static BlockDriver bdrv_host_cdrom = {
>      .bdrv_refresh_limits = raw_refresh_limits,
>      .bdrv_io_plug = raw_aio_plug,
>      .bdrv_io_unplug = raw_aio_unplug,
> +    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>  
>      .bdrv_co_truncate    = raw_co_truncate,
>      .bdrv_getlength      = raw_getlength,
> @@ -3410,6 +3412,7 @@ static BlockDriver bdrv_host_cdrom = {
>      .bdrv_refresh_limits = raw_refresh_limits,
>      .bdrv_io_plug = raw_aio_plug,
>      .bdrv_io_unplug = raw_aio_unplug,
> +    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>  
>      .bdrv_co_truncate    = raw_co_truncate,
>      .bdrv_getlength      = raw_getlength,
> 

Seems sane to me, at a glance.

Reviewed-by: John Snow <jsnow@redhat.com>
Farhan Ali July 20, 2018, 7:11 p.m. UTC | #2
I am seeing another issue pop up, in a different test. Even though it's 
a different assertion, it might be related based on the call trace.

Stack trace of thread 276199:
#0  0x000003ff8473e274 raise (libc.so.6)
#1  0x000003ff847239a8 abort (libc.so.6)
#2  0x000003ff847362ce __assert_fail_base (libc.so.6)
#3  0x000003ff8473634c __assert_fail (libc.so.6)
#4  0x000002aa30aba0c4 iov_memset (qemu-system-s390x)
#5  0x000002aa30aba9a6 qemu_iovec_memset (qemu-system-s390x)
#6  0x000002aa30a23e88 qemu_laio_process_completion (qemu-system-s390x)
#7  0x000002aa30a23f68 qemu_laio_process_completions (qemu-system-s390x)
#8  0x000002aa30a2418e qemu_laio_process_completions_and_submit 
(qemu-system-s390x)
#9  0x000002aa30a24220 qemu_laio_poll_cb (qemu-system-s390x)
#10 0x000002aa30ab22c4 run_poll_handlers_once (qemu-system-s390x)
#11 0x000002aa30ab2e78 aio_poll (qemu-system-s390x)
#12 0x000002aa30a29f4e bdrv_do_drained_begin (qemu-system-s390x)
#13 0x000002aa30a2a276 bdrv_drain (qemu-system-s390x)
#14 0x000002aa309d45aa bdrv_set_aio_context (qemu-system-s390x)
#15 0x000002aa3085acfe virtio_blk_data_plane_stop (qemu-system-s390x)
#16 0x000002aa3096994c virtio_bus_stop_ioeventfd.part.1 (qemu-system-s390x)
#17 0x000002aa3087d1d6 virtio_vmstate_change (qemu-system-s390x)
#18 0x000002aa308e8a12 vm_state_notify (qemu-system-s390x)
#19 0x000002aa3080ed54 do_vm_stop (qemu-system-s390x)
#20 0x000002aa307bea04 main (qemu-system-s390x)
#21 0x000003ff84723dd2 __libc_start_main (libc.so.6)
#22 0x000002aa307c0414 _start (qemu-system-s390x)


The failing assertion is:

qemu-kvm: util/iov.c:78: iov_memset: Assertion `offset == 0' failed.
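
For context on what this assertion is checking (my paraphrase of
util/iov.c, not the verbatim source): iov_memset() first skips `offset'
bytes into the iovec array and then fills `bytes' bytes with `fillc';
after walking all elements it asserts that the skip was fully consumed,
so the failure means the caller passed an offset that lies beyond the
total size of the iovecs. A minimal standalone sketch of that shape:

#include <assert.h>
#include <string.h>
#include <sys/uio.h>

/* Sketch only: same shape as util/iov.c:iov_memset(), reconstructed from
 * memory to illustrate the assertion, not copied from the QEMU tree. */
static size_t iov_memset_sketch(const struct iovec *iov, unsigned int iov_cnt,
                                size_t offset, int fillc, size_t bytes)
{
    size_t done = 0;
    unsigned int i;

    for (i = 0; (offset || done < bytes) && i < iov_cnt; i++) {
        if (offset < iov[i].iov_len) {
            size_t len = iov[i].iov_len - offset;
            if (len > bytes - done) {
                len = bytes - done;
            }
            memset((char *)iov[i].iov_base + offset, fillc, len);
            done += len;
            offset = 0;                 /* skip consumed, start filling */
        } else {
            offset -= iov[i].iov_len;   /* still skipping this element */
        }
    }
    assert(offset == 0);                /* the `offset == 0' seen in the trace */
    return done;
}

int main(void)
{
    char buf[4096];
    struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };

    iov_memset_sketch(&iov, 1, 0, 0, sizeof(buf));    /* in-bounds offset: fine */
    /* iov_memset_sketch(&iov, 1, 8192, 0, 4096); */  /* offset past the end: aborts */
    return 0;
}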

On 07/18/2018 05:12 PM, Nishanth Aravamudan wrote:
> In ed6e2161 ("linux-aio: properly bubble up errors from initialization"),
> I only added a bdrv_attach_aio_context callback for the bdrv_file
> driver. There are several other drivers that use the shared
> aio_plug callback, though, and they will trip the assertion added to
> aio_get_linux_aio because they did not call aio_setup_linux_aio first.
> Add the appropriate callback definition to the affected driver
> definitions.
> 
> Fixes: ed6e2161 ("linux-aio: properly bubble up errors from initialization")
> Reported-by: Farhan Ali <alifm@linux.ibm.com>
> Signed-off-by: Nishanth Aravamudan <naravamudan@digitalocean.com>
> Cc: Eric Blake <eblake@redhat.com>
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: John Snow <jsnow@redhat.com>
> Cc: Max Reitz <mreitz@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Fam Zheng <famz@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: qemu-block@nongnu.org
> Cc: qemu-devel@nongnu.org
> ---
>   block/file-posix.c | 3 +++
>   1 file changed, 3 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 60af4b3d51..ad299beb38 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -3158,6 +3158,7 @@ static BlockDriver bdrv_host_device = {
>       .bdrv_refresh_limits = raw_refresh_limits,
>       .bdrv_io_plug = raw_aio_plug,
>       .bdrv_io_unplug = raw_aio_unplug,
> +    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>   
>       .bdrv_co_truncate       = raw_co_truncate,
>       .bdrv_getlength	= raw_getlength,
> @@ -3280,6 +3281,7 @@ static BlockDriver bdrv_host_cdrom = {
>       .bdrv_refresh_limits = raw_refresh_limits,
>       .bdrv_io_plug = raw_aio_plug,
>       .bdrv_io_unplug = raw_aio_unplug,
> +    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>   
>       .bdrv_co_truncate    = raw_co_truncate,
>       .bdrv_getlength      = raw_getlength,
> @@ -3410,6 +3412,7 @@ static BlockDriver bdrv_host_cdrom = {
>       .bdrv_refresh_limits = raw_refresh_limits,
>       .bdrv_io_plug = raw_aio_plug,
>       .bdrv_io_unplug = raw_aio_unplug,
> +    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>   
>       .bdrv_co_truncate    = raw_co_truncate,
>       .bdrv_getlength      = raw_getlength,
>
Nishanth Aravamudan July 20, 2018, 7:32 p.m. UTC | #3
On 20.07.2018 [15:11:14 -0400], Farhan Ali wrote:
> I am seeing another issue pop up, in a different test. Even though it's a
> different assertion, it might be related based on the call trace.

Just to be clear, this does not happen if you revert the original patch
(i.e., the one you bisected to before)?

I'm digging into the code now.

-Nish
Farhan Ali July 20, 2018, 7:42 p.m. UTC | #4
On 07/20/2018 03:32 PM, Nishanth Aravamudan wrote:
> On 20.07.2018 [15:11:14 -0400], Farhan Ali wrote:
>> I am seeing another issue pop up, in a different test. Even though it's a
>> different assertion, it might be related based on the call trace.
> 
> Just to be clear, this does not happen if you revert the original patch
> (i.e., the one you bisected to before)?
> 
> I'm digging into the code now.
> 
> -Nish
> 
> 
I had not seen this issue before. I just ran my regression tests with 
your fix and saw the failure in one of my tests. The patch in itself 
fixes the original issue I reported. I am going to try to debug some more.

Thanks
Farhan
Farhan Ali July 23, 2018, 1:34 p.m. UTC | #5
On 07/20/2018 03:11 PM, Farhan Ali wrote:
> I am seeing another issue pop up, in a different test. Even though it's 
> a different assertion, it might be related based on the call trace.
> 
> Stack trace of thread 276199:
> #0  0x000003ff8473e274 raise (libc.so.6)
> #1  0x000003ff847239a8 abort (libc.so.6)
> #2  0x000003ff847362ce __assert_fail_base (libc.so.6)
> #3  0x000003ff8473634c __assert_fail (libc.so.6)
> #4  0x000002aa30aba0c4 iov_memset (qemu-system-s390x)
> #5  0x000002aa30aba9a6 qemu_iovec_memset (qemu-system-s390x)
> #6  0x000002aa30a23e88 qemu_laio_process_completion (qemu-system-s390x)
> #7  0x000002aa30a23f68 qemu_laio_process_completions (qemu-system-s390x)
> #8  0x000002aa30a2418e qemu_laio_process_completions_and_submit 
> (qemu-system-s390x)
> #9  0x000002aa30a24220 qemu_laio_poll_cb (qemu-system-s390x)
> #10 0x000002aa30ab22c4 run_poll_handlers_once (qemu-system-s390x)
> #11 0x000002aa30ab2e78 aio_poll (qemu-system-s390x)
> #12 0x000002aa30a29f4e bdrv_do_drained_begin (qemu-system-s390x)
> #13 0x000002aa30a2a276 bdrv_drain (qemu-system-s390x)
> #14 0x000002aa309d45aa bdrv_set_aio_context (qemu-system-s390x)
> #15 0x000002aa3085acfe virtio_blk_data_plane_stop (qemu-system-s390x)
> #16 0x000002aa3096994c virtio_bus_stop_ioeventfd.part.1 (qemu-system-s390x)
> #17 0x000002aa3087d1d6 virtio_vmstate_change (qemu-system-s390x)
> #18 0x000002aa308e8a12 vm_state_notify (qemu-system-s390x)
> #19 0x000002aa3080ed54 do_vm_stop (qemu-system-s390x)
> #20 0x000002aa307bea04 main (qemu-system-s390x)
> #21 0x000003ff84723dd2 __libc_start_main (libc.so.6)
> #22 0x000002aa307c0414 _start (qemu-system-s390x)
> 
> 
> The failing assertion is:
> 
> qemu-kvm: util/iov.c:78: iov_memset: Assertion `offset == 0' failed.
> 

Just to give some context, this is a guest with 2 disks, each assigned 
an iothread. The guest was running a memory-intensive workload.

From the coredump of the qemu process, I see there were 2 threads that 
were trying to call aio_poll with the same AioContext on the same 
BlockDeviceDriver:

Thread 1:

#0  0x000003ff8473e274 in raise () from /lib64/libc.so.6
#1  0x000003ff847239a8 in abort () from /lib64/libc.so.6
#2  0x000003ff847362ce in __assert_fail_base () from /lib64/libc.so.6
#3  0x000003ff8473634c in __assert_fail () from /lib64/libc.so.6
#4  0x000002aa30aba0c4 in iov_memset (iov=<optimized out>, 
iov_cnt=<optimized out>, offset=<optimized out>, fillc=<optimized out>, 
bytes=18446744073709547520) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/util/iov.c:78
#5  0x000002aa30aba9a6 in qemu_iovec_memset (qiov=<optimized out>, 
offset=offset@entry=8192, fillc=fillc@entry=0, 
bytes=18446744073709547520) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/util/iov.c:410
#6  0x000002aa30a23e88 in qemu_laio_process_completion 
(laiocb=0x3fe36a6a3f0) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/linux-aio.c:88
#7  0x000002aa30a23f68 in qemu_laio_process_completions 
(s=s@entry=0x3fe60001910) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/linux-aio.c:222
#8  0x000002aa30a2418e in qemu_laio_process_completions_and_submit 
(s=0x3fe60001910) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/linux-aio.c:237
#9  0x000002aa30a24220 in qemu_laio_poll_cb (opaque=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/linux-aio.c:272
#10 0x000002aa30ab22c4 in run_poll_handlers_once 
(ctx=ctx@entry=0x2aa4f35df50) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/util/aio-posix.c:494
#11 0x000002aa30ab2e78 in try_poll_mode (blocking=<optimized out>, 
ctx=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/util/aio-posix.c:573
====>  #12 aio_poll (ctx=0x2aa4f35df50, blocking=blocking@entry=false) 
at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/util/aio-posix.c:602
#13 0x000002aa30a29f4e in bdrv_drain_poll_top_level 
(ignore_parent=<optimized out>, recursive=<optimized out>, bs=<optimized 
out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/io.c:390
#14 bdrv_do_drained_begin (bs=0x2aa4f392510, recursive=<optimized out>, 
parent=0x0, ignore_bds_parents=<optimized out>, poll=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/io.c:390
#15 0x000002aa30a2a276 in bdrv_drained_begin (bs=0x2aa4f392510) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/io.c:396
#16 bdrv_drain (bs=bs@entry=0x2aa4f392510) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/io.c:478
#17 0x000002aa309d45aa in bdrv_set_aio_context (bs=0x2aa4f392510, 
new_context=0x2aa4f3594f0) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block.c:4954
#18 0x000002aa30a1c228 in blk_set_aio_context 
(blk=blk@entry=0x2aa4f38ed90, new_context=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/block-backend.c:1894
#19 0x000002aa3085acfe in virtio_blk_data_plane_stop (vdev=<optimized 
out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/hw/block/dataplane/virtio-blk.c:285
#20 0x000002aa3096994c in virtio_bus_stop_ioeventfd (bus=0x2aa4f4f61f0) 
at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/hw/virtio/virtio-bus.c:246
#21 0x000002aa3087d1d6 in virtio_vmstate_change (opaque=0x2aa4f4f72b8, 
running=<optimized out>, state=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/hw/virtio/virtio.c:2222
#22 0x000002aa308e8a12 in vm_state_notify (running=<optimized out>, 
state=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/vl.c:1532
#23 0x000002aa3080ed54 in do_vm_stop (state=<optimized out>, 
send_stop=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/cpus.c:1012
#24 0x000002aa307bea04 in main (argc=<optimized out>, argv=<optimized 
out>, envp=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/vl.c:4649


Thread 2 which is an IOThread:

#0  0x000003ff84910f9e in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x000003ff8490a1a2 in pthread_mutex_lock () from /lib64/libpthread.so.0
#2  0x000002aa30ab4cea in qemu_mutex_lock_impl (mutex=0x2aa4f35dfb0, 
file=file@entry=0x2aa30b963f4 
"/builddir/build/BUILD/qemu-2.12.91/util/async.c", line=line@entry=511)
     at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/util/qemu-thread-posix.c:66
#3  0x000002aa30aafff4 in aio_context_acquire (ctx=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/util/async.c:511
#4  0x000002aa30a2419a in qemu_laio_process_completions_and_submit 
(s=0x3fe60001910) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/block/linux-aio.c:239
#5  0x000002aa30ab23ee in aio_dispatch_handlers 
(ctx=ctx@entry=0x2aa4f35df50) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/util/aio-posix.c:406
=====> #6  0x000002aa30ab30b4 in aio_poll (ctx=0x2aa4f35df50, 
blocking=blocking@entry=true) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/util/aio-posix.c:692
#7  0x000002aa308e2322 in iothread_run (opaque=0x2aa4f35d5c0) at 
/usr/src/debug/qemu-2.12.91-20180720.0.677af45304.fc28.s390x/iothread.c:63
#8  0x000003ff849079a8 in start_thread () from /lib64/libpthread.so.0
#9  0x000003ff847f97ee in thread_start () from /lib64/libc.so.6


This looked a little suspicious to me; I don't know if this is the 
expected behavior or whether there is a race condition here. Any help 
debugging this would be greatly appreciated.

Thanks
Farhan
Stefan Hajnoczi July 23, 2018, 4:25 p.m. UTC | #6
On Wed, Jul 18, 2018 at 02:12:56PM -0700, Nishanth Aravamudan wrote:
> In ed6e2161 ("linux-aio: properly bubble up errors from initialization"),
> I only added a bdrv_attach_aio_context callback for the bdrv_file
> driver. There are several other drivers that use the shared
> aio_plug callback, though, and they will trip the assertion added to
> aio_get_linux_aio because they did not call aio_setup_linux_aio first.
> Add the appropriate callback definition to the affected driver
> definitions.
> 
> Fixes: ed6e2161 ("linux-aio: properly bubble up errors from initialization")
> Reported-by: Farhan Ali <alifm@linux.ibm.com>
> Signed-off-by: Nishanth Aravamudan <naravamudan@digitalocean.com>
> Cc: Eric Blake <eblake@redhat.com>
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: John Snow <jsnow@redhat.com>
> Cc: Max Reitz <mreitz@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Cc: Fam Zheng <famz@redhat.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: qemu-block@nongnu.org
> Cc: qemu-devel@nongnu.org
> ---
>  block/file-posix.c | 3 +++
>  1 file changed, 3 insertions(+)

This patch looks good.  The remaining s390 crash can be debugged
separately.

The FreeBSD host_cdrom change isn't strictly necessary (no Linux AIO on
FreeBSD), but it doesn't hurt to call raw_aio_attach_aio_context() for
consistency.

Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan
Stefan Hajnoczi July 23, 2018, 4:30 p.m. UTC | #7
On Fri, Jul 20, 2018 at 03:11:14PM -0400, Farhan Ali wrote:
> I am seeing another issue pop up, in a different test. Even though it's a
> different assertion, it might be related based on the call trace.

Which test case?

> Stack trace of thread 276199:
> #0  0x000003ff8473e274 raise (libc.so.6)
> #1  0x000003ff847239a8 abort (libc.so.6)
> #2  0x000003ff847362ce __assert_fail_base (libc.so.6)
> #3  0x000003ff8473634c __assert_fail (libc.so.6)
> #4  0x000002aa30aba0c4 iov_memset (qemu-system-s390x)
> #5  0x000002aa30aba9a6 qemu_iovec_memset (qemu-system-s390x)
> #6  0x000002aa30a23e88 qemu_laio_process_completion (qemu-system-s390x)

What are the values of laiocb->qiov->size and laiocb->ret?

> #7  0x000002aa30a23f68 qemu_laio_process_completions (qemu-system-s390x)
> #8  0x000002aa30a2418e qemu_laio_process_completions_and_submit
> (qemu-system-s390x)
> #9  0x000002aa30a24220 qemu_laio_poll_cb (qemu-system-s390x)
> #10 0x000002aa30ab22c4 run_poll_handlers_once (qemu-system-s390x)
> #11 0x000002aa30ab2e78 aio_poll (qemu-system-s390x)
> #12 0x000002aa30a29f4e bdrv_do_drained_begin (qemu-system-s390x)
> #13 0x000002aa30a2a276 bdrv_drain (qemu-system-s390x)
> #14 0x000002aa309d45aa bdrv_set_aio_context (qemu-system-s390x)
> #15 0x000002aa3085acfe virtio_blk_data_plane_stop (qemu-system-s390x)
> #16 0x000002aa3096994c virtio_bus_stop_ioeventfd.part.1 (qemu-system-s390x)
> #17 0x000002aa3087d1d6 virtio_vmstate_change (qemu-system-s390x)
> #18 0x000002aa308e8a12 vm_state_notify (qemu-system-s390x)
> #19 0x000002aa3080ed54 do_vm_stop (qemu-system-s390x)
> #20 0x000002aa307bea04 main (qemu-system-s390x)
> #21 0x000003ff84723dd2 __libc_start_main (libc.so.6)
> #22 0x000002aa307c0414 _start (qemu-system-s390x)
> 
> 
> The failing assertion is:
> 
> qemu-kvm: util/iov.c:78: iov_memset: Assertion `offset == 0' failed.

I wonder if the offset is beyond the end of the iovecs.

Thanks,
Stefan
Farhan Ali July 23, 2018, 4:42 p.m. UTC | #8
On 07/23/2018 12:30 PM, Stefan Hajnoczi wrote:
> On Fri, Jul 20, 2018 at 03:11:14PM -0400, Farhan Ali wrote:
>> I am seeing another issue pop up, in a different test. Even though it's a
>> different assertion, it might be related based on the call trace.
> 
> Which test case?

This test case involved one guest with 2 disks, with an iothread for 
each disk. The guest was running a memory workload.

> 
>> Stack trace of thread 276199:
>> #0  0x000003ff8473e274 raise (libc.so.6)
>> #1  0x000003ff847239a8 abort (libc.so.6)
>> #2  0x000003ff847362ce __assert_fail_base (libc.so.6)
>> #3  0x000003ff8473634c __assert_fail (libc.so.6)
>> #4  0x000002aa30aba0c4 iov_memset (qemu-system-s390x)
>> #5  0x000002aa30aba9a6 qemu_iovec_memset (qemu-system-s390x)
>> #6  0x000002aa30a23e88 qemu_laio_process_completion (qemu-system-s390x)
> 
> What are the values of laiocb->qiov->size and laiocb->ret?

The laiocb->qiov->size was 4096 and laiocb->ret was 8192

> 
>> #7  0x000002aa30a23f68 qemu_laio_process_completions (qemu-system-s390x)
>> #8  0x000002aa30a2418e qemu_laio_process_completions_and_submit
>> (qemu-system-s390x)
>> #9  0x000002aa30a24220 qemu_laio_poll_cb (qemu-system-s390x)
>> #10 0x000002aa30ab22c4 run_poll_handlers_once (qemu-system-s390x)
>> #11 0x000002aa30ab2e78 aio_poll (qemu-system-s390x)
>> #12 0x000002aa30a29f4e bdrv_do_drained_begin (qemu-system-s390x)
>> #13 0x000002aa30a2a276 bdrv_drain (qemu-system-s390x)
>> #14 0x000002aa309d45aa bdrv_set_aio_context (qemu-system-s390x)
>> #15 0x000002aa3085acfe virtio_blk_data_plane_stop (qemu-system-s390x)
>> #16 0x000002aa3096994c virtio_bus_stop_ioeventfd.part.1 (qemu-system-s390x)
>> #17 0x000002aa3087d1d6 virtio_vmstate_change (qemu-system-s390x)
>> #18 0x000002aa308e8a12 vm_state_notify (qemu-system-s390x)
>> #19 0x000002aa3080ed54 do_vm_stop (qemu-system-s390x)
>> #20 0x000002aa307bea04 main (qemu-system-s390x)
>> #21 0x000003ff84723dd2 __libc_start_main (libc.so.6)
>> #22 0x000002aa307c0414 _start (qemu-system-s390x)
>>
>>
>> The failing assertion is:
>>
>> qemu-kvm: util/iov.c:78: iov_memset: Assertion `offset == 0' failed.
> 
> I wonder if the offset is beyond the end of the iovecs.
> 
> Thanks,
> Stefan
> 

Thanks
Farhan
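
Those two values line up with the garbage length in the earlier backtrace.
My reading of the short-read padding path in qemu_laio_process_completion()
(block/linux-aio.c:88 in the trace) is that it ends up calling
qemu_iovec_memset(qiov, ret, 0, qiov->size - ret); with ret = 8192 against
a 4096-byte qiov, the offset lands past the end of the iovecs (hence the
`offset == 0' assertion) and the unsigned subtraction wraps around. A quick
standalone check of the arithmetic (the variable names here are mine, not
QEMU's):

#include <stdio.h>
#include <sys/types.h>

int main(void)
{
    size_t qiov_size = 4096;   /* laiocb->qiov->size reported above */
    ssize_t ret = 8192;        /* laiocb->ret reported above */

    size_t offset = (size_t)ret;             /* passed as the memset offset */
    size_t bytes = qiov_size - (size_t)ret;  /* unsigned subtraction wraps */

    printf("offset=%zu bytes=%zu\n", offset, bytes);
    /* Prints offset=8192 bytes=18446744073709547520 -- the same bytes value
     * shown in the Thread 1 backtrace, and an offset well past the end of a
     * 4096-byte iovec, which is exactly what iov_memset() asserts against. */
    return 0;
}

A completion claiming more bytes than the 4096-byte request submitted would
also be consistent with the two threads seen processing the same
LinuxAioState earlier in this thread, but that part is speculation on my part.
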
Stefan Hajnoczi July 27, 2018, 1:26 p.m. UTC | #9
On Mon, Jul 23, 2018 at 12:42:02PM -0400, Farhan Ali wrote:
> 
> 
> On 07/23/2018 12:30 PM, Stefan Hajnoczi wrote:
> > On Fri, Jul 20, 2018 at 03:11:14PM -0400, Farhan Ali wrote:
> > > I am seeing another issue pop up, in a different test. Even though it's a
> > > different assertion, it might be related based on the call trace.
> > 
> > Which test case?
> 
> This test case involved one guest with 2 disks, with an iothread for each
> disk. The guest was running a memory workload.

Please post a link to the test case.

Stefan
Farhan Ali July 27, 2018, 2:54 p.m. UTC | #10
On 07/27/2018 09:26 AM, Stefan Hajnoczi wrote:
> On Mon, Jul 23, 2018 at 12:42:02PM -0400, Farhan Ali wrote:
>>
>>
>> On 07/23/2018 12:30 PM, Stefan Hajnoczi wrote:
>>> On Fri, Jul 20, 2018 at 03:11:14PM -0400, Farhan Ali wrote:
>>>> I am seeing another issue pop up, in a different test. Even though it's a
>>>> different assertion, it might be related based on the call trace.
>>>
>>> Which test case?
>>
>> This test case involved one guest with 2 disks, with an iothread for each
>> disk. The guest was running a memory workload.
> 
> Please post a link to the test case.
> 
> Stefan
> 
Hi Stefan,

Thanks for your response. The test case was run in our internal 
infrastructure, so unfortunately I cannot post any link to the test case.

I have been unable to reproduce this exact issue. On the other hand, the 
same test case is throwing another assertion error:

qemu-kvm: /builddir/build/BUILD/qemu-2.12.91/exec.c:3695: 
address_space_unmap: Assertion `mr != NULL' failed.

Again, this is a very strange error to me. I have been able to reproduce 
this assertion a couple of times, though I don't hit it on every run of 
the test case.


The qemu command line for the test case:

/usr/bin/qemu-kvm -name guest=sles,debug-threads=on -S -object 
secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-6-sles/master-key.aes 
-machine s390-ccw-virtio-3.0,accel=kvm,usb=off,dump-guest-core=off -m 
4096 -realtime mlock=off -smp 8,sockets=8,cores=1,threads=1 -object 
iothread,id=iothread1 -object iothread,id=iothread2 -uuid 
094a20ff-e881-44db-a772-fb4029cf8f09 -display none -no-user-config 
-nodefaults -chardev socket,id=charmonitor,fd=28,server,nowait -mon 
chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown 
-boot strict=on -drive 
file=/dev/mapper/360050763998b0883980000003300003a,format=raw,if=none,id=drive-virtio-disk0,cache=none,aio=native 
-device 
virtio-blk-ccw,iothread=iothread1,scsi=off,devno=fe.0.0001,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=on 
-drive 
file=/dev/mapper/360050763998b0883980000001e000025,format=raw,if=none,id=drive-virtio-disk1,cache=none,aio=native 
-device 
virtio-blk-ccw,iothread=iothread2,scsi=off,devno=fe.0.0002,drive=drive-virtio-disk1,id=virtio-disk1,write-cache=on 
-netdev tap,fd=30,id=hostnet0,vhost=on,vhostfd=31 -device 
virtio-net-ccw,netdev=hostnet0,id=net0,mac=02:f5:5d:7d:7d:ef,devno=fe.0.0000 
-chardev pty,id=charconsole0 -device 
sclpconsole,chardev=charconsole0,id=console0 -device 
virtio-balloon-ccw,id=balloon0,devno=fe.3.ffba -sandbox 
on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny 
-msg timestamp=on


The backtrace of the thread which trips the assertion:

#0  0x000003ffb553e274 in raise () from /lib64/libc.so.6
#1  0x000003ffb55239a8 in abort () from /lib64/libc.so.6
#2  0x000003ffb55362ce in __assert_fail_base () from /lib64/libc.so.6
#3  0x000003ffb553634c in __assert_fail () from /lib64/libc.so.6
#4  0x000002aa39b487e6 in address_space_unmap (as=<optimized out>, 
buffer=<optimized out>, len=<optimized out>, is_write=<optimized out>, 
access_len=<optimized out>)
     at 
/usr/src/debug/qemu-2.12.91-20180726.0.c6cf862329.fc28.s390x/exec.c:3695
#5  0x000002aa39bfb550 in dma_memory_unmap (access_len=0, 
dir=DMA_DIRECTION_FROM_DEVICE, len=<optimized out>, buffer=<optimized 
out>, as=0x2aa3a085b80 <address_space_memory>)
     at 
/usr/src/debug/qemu-2.12.91-20180726.0.c6cf862329.fc28.s390x/include/sysemu/dma.h:146
#6  virtqueue_unmap_sg (elem=elem@entry=0x3ff9c01c9d0, len=len@entry=1, 
vq=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180726.0.c6cf862329.fc28.s390x/hw/virtio/virtio.c:401
#7  0x000002aa39bfc840 in virtqueue_fill (vq=vq@entry=0x3ffb6c9e010, 
elem=0x3ff9c01c9d0, len=<optimized out>, idx=idx@entry=0) at 
/usr/src/debug/qemu-2.12.91-20180726.0.c6cf862329.fc28.s390x/hw/virtio/virtio.c:476
#8  0x000002aa39bfcb80 in virtqueue_push (vq=0x3ffb6c9e010, 
elem=elem@entry=0x3ff9c01c9d0, len=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180726.0.c6cf862329.fc28.s390x/hw/virtio/virtio.c:522
#9  0x000002aa39bd78c2 in virtio_blk_req_complete (req=0x3ff9c01c9d0, 
status=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180726.0.c6cf862329.fc28.s390x/hw/block/virtio-blk.c:55
#10 0x000002aa39bd8540 in virtio_blk_rw_complete (opaque=<optimized 
out>, ret=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180726.0.c6cf862329.fc28.s390x/hw/block/virtio-blk.c:121
#11 0x000002aa39d9a15e in blk_aio_complete (acb=0x3ff9c04f670) at 
/usr/src/debug/qemu-2.12.91-20180726.0.c6cf862329.fc28.s390x/block/block-backend.c:1336
#12 0x000002aa39e48e98 in coroutine_trampoline (i0=<optimized out>, 
i1=<optimized out>) at 
/usr/src/debug/qemu-2.12.91-20180726.0.c6cf862329.fc28.s390x/util/coroutine-ucontext.c:116
#13 0x000003ffb5553b7a in __makecontext_ret () from /lib64/libc.so.6


I don't know if these issues are related to the same underlying problem.
Any help is really appreciated.

Thanks
Farhan

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index 60af4b3d51..ad299beb38 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -3158,6 +3158,7 @@  static BlockDriver bdrv_host_device = {
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,
     .bdrv_io_unplug = raw_aio_unplug,
+    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate       = raw_co_truncate,
     .bdrv_getlength	= raw_getlength,
@@ -3280,6 +3281,7 @@  static BlockDriver bdrv_host_cdrom = {
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,
     .bdrv_io_unplug = raw_aio_unplug,
+    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate    = raw_co_truncate,
     .bdrv_getlength      = raw_getlength,
@@ -3410,6 +3412,7 @@  static BlockDriver bdrv_host_cdrom = {
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,
     .bdrv_io_unplug = raw_aio_unplug,
+    .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
     .bdrv_co_truncate    = raw_co_truncate,
     .bdrv_getlength      = raw_getlength,