diff mbox

[xen-unstable,test] 113959: regressions - FAIL

Message ID 20171009075807.GA10403@op-computing (mailing list archive)
State New, archived
Headers show

Commit Message

Chao Gao Oct. 9, 2017, 7:58 a.m. UTC
On Mon, Oct 09, 2017 at 02:13:22PM +0800, Chao Gao wrote:
>On Tue, Oct 03, 2017 at 11:08:01AM +0100, Roger Pau Monné wrote:
>>On Tue, Oct 03, 2017 at 09:55:44AM +0000, osstest service owner wrote:
>>> flight 113959 xen-unstable real [real]
>>> http://logs.test-lab.xenproject.org/osstest/logs/113959/
>>> 
>>> Regressions :-(
>>> 
>>> Tests which did not succeed and are blocking,
>>> including tests which could not be run:
>>>  test-amd64-i386-libvirt-xsm  21 leak-check/check         fail REGR. vs. 113954
>>
>>This is due to cron running when the leak-check is executed.
>>
>>>  test-armhf-armhf-xl-multivcpu  5 host-ping-check-native  fail REGR. vs. 113954
>>>  test-amd64-i386-xl-qemut-debianhvm-amd64 17 guest-stop   fail REGR. vs. 113954
>>
>>The test below has triggered the following ASSERT, CCing the Intel
>>guys.
>>
>>Oct  3 06:12:00.415168 (XEN) d15v0: intack: 2:30 pt: 38
>>Oct  3 06:12:19.191141 (XEN) vIRR: 00000000 00000000 00000000 00000000 00000000 00000000 00010000 00000000
>>Oct  3 06:12:19.199162 (XEN)  PIR: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
>>Oct  3 06:12:19.207160 (XEN) Assertion 'intack.vector >= pt_vector' failed at intr.c:367
>>Oct  3 06:12:19.215215 (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Not tainted ]----
>>Oct  3 06:12:19.223124 (XEN) CPU:    1
>>Oct  3 06:12:19.223153 (XEN) RIP:    e008:[<ffff82d0803022a5>] vmx_intr_assist+0x617/0x637
>>Oct  3 06:12:19.231185 (XEN) RFLAGS: 0000000000010292   CONTEXT: hypervisor (d15v0)
>>Oct  3 06:12:19.239163 (XEN) rax: ffff83022dfc802c   rbx: ffff8300ccc65680   rcx: 0000000000000000
>>Oct  3 06:12:19.247169 (XEN) rdx: ffff83022df7ffff   rsi: 000000000000000a   rdi: ffff82d0804606d8
>>Oct  3 06:12:19.255127 (XEN) rbp: ffff83022df7ff08   rsp: ffff83022df7fea8   r8:  ffff83022df90000
>>Oct  3 06:12:19.263114 (XEN) r9:  0000000000000001   r10: 0000000000000000   r11: 0000000000000001
>>Oct  3 06:12:19.271109 (XEN) r12: 00000000ffffffff   r13: ffff82d0803cfba6   r14: ffff82d0803cfba6
>>Oct  3 06:12:19.279119 (XEN) r15: 0000000000000004   cr0: 0000000080050033   cr4: 00000000001526e0
>>Oct  3 06:12:19.279157 (XEN) cr3: 0000000214274000   cr2: 00005622a2184dbf
>>Oct  3 06:12:19.287123 (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
>>Oct  3 06:12:19.295105 (XEN) Xen code around <ffff82d0803022a5> (vmx_intr_assist+0x617/0x637):
>>Oct  3 06:12:19.303150 (XEN)  41 bf 00 00 00 00 eb a0 <0f> 0b 89 ce 48 89 df e8 bb 20 00 00 e9 49 fe ff
>>Oct  3 06:12:19.311112 (XEN) Xen stack trace from rsp=ffff83022df7fea8:
>>Oct  3 06:12:19.311146 (XEN)    ffff83022df7ff08 000000388030cf76 ffff82d0805a7570 ffff82d08057ad80
>>Oct  3 06:12:19.319131 (XEN)    ffff83022df7ffff ffff83022df7fee0 ffff82d08023b9b6 ffff8300ccc65000
>>Oct  3 06:12:19.327115 (XEN)    000000000000000b 0000000000000020 00000000000000c2 0000000000000004
>>Oct  3 06:12:19.345094 (XEN)    ffff880029eb4000 ffff82d080311c21 0000000000000004 00000000000000c2
>>Oct  3 06:12:19.345177 (XEN)    0000000000000020 000000000000000b ffff880029eb4000 ffffffff81adf0a0
>>Oct  3 06:12:19.351221 (XEN)    0000000000000000 0000000000000000 ffff88002d400008 0000000000000000
>>Oct  3 06:12:19.359439 (XEN)    0000000000000030 0000000000000000 00000000000003f8 00000000000003f8
>>Oct  3 06:12:19.367267 (XEN)    ffffffff81adf0a0 0000beef0000beef ffffffff8138a5f4 000000bf0000beef
>>Oct  3 06:12:19.375222 (XEN)    0000000000000002 ffff88002f803e08 000000000000beef 000000000000beef
>>Oct  3 06:12:19.383198 (XEN)    000000000000beef 000000000000beef 000000000000beef 0000000000000001
>>Oct  3 06:12:19.391230 (XEN)    ffff8300ccc65000 00000031ada20d00 00000000001526e0
>>Oct  3 06:12:19.399336 (XEN) Xen call trace:
>>Oct  3 06:12:19.399389 (XEN)    [<ffff82d0803022a5>] vmx_intr_assist+0x617/0x637
>>Oct  3 06:12:19.407337 (XEN)    [<ffff82d080311c21>] vmx_asm_vmexit_handler+0x41/0x120
>>Oct  3 06:12:19.407380 (XEN) 
>>Oct  3 06:12:19.415246 (XEN) 
>>Oct  3 06:12:19.415278 (XEN) ****************************************
>>Oct  3 06:12:19.415307 (XEN) Panic on CPU 1:
>>Oct  3 06:12:19.415332 (XEN) Assertion 'intack.vector >= pt_vector' failed at intr.c:367
>>Oct  3 06:12:19.423432 (XEN) ****************************************
>
>(CC Jan)
>
>Hi, Roger.
>
>I sent a patch to fix a possible cause of this bug, seeing
>https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg03254.html.
>
>Due to Xen 4.9 release, I put this patch aside and later forgot to
>continue fixing this bug. Sorry for this. Of course, I will fix this
>bug.
>
>I thought the root case was:
>When injecting periodic timer interrupt in vmx_intr_assist(),
>multi-read operations are done during one event delivery. For
>example, if a periodic timer interrupt is from PIT, when set the
>corresponding bit in vIRR, the corresponding RTE is accessed in
>pt_update_irq(). When this function returns, it accesses the RTE
>again to get the vector it sets in vIRR.  Between the two
>accesses, the content of RTE may have been changed by another CPU
>for no protection method in use. This case can incur the
>assertion failure in vmx_intr_assist().
>
>For example, in this case, we may set 0x30 in vIRR, but return 0x38 to
>vmx_intr_assist(). When we try to inject an interrupt, we would find
>0x38 is greater than the highest vector; then the assertion failure
>happened. I have a xtf case to reproduce this bug, seeing
>https://lists.xenproject.org/archives/html/xen-devel/2017-03/msg02906.html.
>But according to Jan's opinion, he thought the bug was unlikely
>triggered in OSSTEST by these weird operations.
>
>After thinking over it, the bug also can be caused by pt_update_irq()
>returns 0x38 but it doesn't set 0x38 in vIRR for the corresponding RTE
>is masked. Please refer to the code path:
>vmx_intr_assist() -> pt_update_irq() -> hvm_isa_irq_assert() ->
>assert_irq() -> assert_gsi() -> vioapic_irq_positive_edge().
>Note that in vioapic_irq_positive_edge(), if ent->fields.mask is set,
>the function returns without setting the corresponding bit in vIRR.

To verify this guess, I modify the above xtf a little. The new xtf test
(enclosed in attachment) Create a guest with 2 vCPU. vCPU0 sets up PIT
to generate timer interrupt every 1ms. It also boots up vCPU1. vCPU1
incessantly masks/unmasks the corresponding IOAPIC RTE and sends IPI
(vector 0x30) to vCPU0. The bug happens as expected:

(d2) [ 1146.910517] --- Xen Test Framework ---
(d2) [ 1146.910701] Environment: HVM 32bit (No paging)
(d2) [ 1146.910823] Test periodic-timer-2
(d2) [ 1146.911007] activate cpu1
(XEN) [ 1146.953514] printk: 27 messages suppressed.
(XEN) [ 1146.953517] irq2 masked
(XEN) [ 1146.953523] d2v0: intack: 2:30 pt: 38
(XEN) [ 1146.953525] vIRR: 00000000 00000000 00000000 00000000 00000000 00000000 00010000 00000000
(XEN) [ 1146.953530]  PIR: 00000000 00000000 00000000 00000000 00000000 00000000 00010000 00000000
(XEN) [ 1146.953555] Assertion 'intack.vector >= pt_vector' failed at intr.c:367
(XEN) [ 1146.953560] ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
(XEN) [ 1146.953562] CPU:    32
(XEN) [ 1146.953564] RIP:    e008:[<ffff82d0802fc62d>] vmx_intr_assist+0x62d/0x63e
(XEN) [ 1146.953571] RFLAGS: 0000000000010296   CONTEXT: hypervisor (d2v0)
(XEN) [ 1146.953575] rax: 0000000000000200   rbx: ffff83006f927680   rcx: 0000000000000000
(XEN) [ 1146.953578] rdx: ffff83207b20ffff   rsi: 000000000000000a   rdi: ffff82d0804526b8
(XEN) [ 1146.953581] rbp: ffff83207b20ff08   rsp: ffff83207b20fe98   r8:  ffff83107e000000
(XEN) [ 1146.953583] r9:  0000000000000001   r10: 0000000000000000   r11: 0000000000000001
(XEN) [ 1146.953586] r12: 00000000ffffffff   r13: ffff82d0803aafb3   r14: ffff82c00032f200
(XEN) [ 1146.953589] r15: 0000000000004016   cr0: 000000008005003b   cr4: 00000000001526e0
(XEN) [ 1146.953592] cr3: 000000104ebed000   cr2: 0000000000000000
(XEN) [ 1146.953594] ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
(XEN) [ 1146.953598] Xen code around <ffff82d0802fc62d> (vmx_intr_assist+0x62d/0x63e):
(XEN) [ 1146.953601]  20 00 00 e9 64 fe ff ff <0f> 0b 48 83 c4 48 5b 41 5c 41 5d 41 5e 41 5f 5d
(XEN) [ 1146.953610] Xen stack trace from rsp=ffff83207b20fe98:
(XEN) [ 1146.953612]    ffff83207b20ff08 ffff82d08046bc00 ffff82d000000030 ffffffffffffffff
(XEN) [ 1146.953616]    ffff83207b20ffff 0000000000000000 ffff83207b20fef8 ffff82d080238c5f
(XEN) [ 1146.953620]    ffff83006f927000 ffff83006f927000 ffff83006f927000 ffff831072cac188
(XEN) [ 1146.953624]    0000010b0bc16022 0000000001c9c380 ffff83207b20fe28 ffff82d08030be01
(XEN) [ 1146.953628]    0000000000000000 0000000000000000 0000000000000000 0000000000000000
(XEN) [ 1146.953631]    0000000000000000 00000000fee00000 0000000000000000 0000000000000000
(XEN) [ 1146.953634]    0000000000000000 0000000000000000 0000000000000004 0000000000000000
(XEN) [ 1146.953637]    0000000000000001 0000000000000000 0000000000000000 0000beef0000beef
(XEN) [ 1146.953640]    0000000000102b58 000000bf0000beef 0000000000000206 0000000000114fa0
(XEN) [ 1146.953643]    000000000000beef 000000000000beef 000000000000beef 000000000000beef
(XEN) [ 1146.953647]    000000000000beef 0000000000000020 ffff83006f927000 0000003ff2815080
(XEN) [ 1146.953650]    00000000001526e0
(XEN) [ 1146.953652] Xen call trace:
(XEN) [ 1146.953655]    [<ffff82d0802fc62d>] vmx_intr_assist+0x62d/0x63e
(XEN) [ 1146.953662]    [<ffff82d08030be01>] vmx_asm_vmexit_handler+0x41/0x120
(XEN) [ 1146.953663] 
(XEN) [ 1147.238180] 
(XEN) [ 1147.240407] ****************************************
(XEN) [ 1147.246515] Panic on CPU 32:
(XEN) [ 1147.250299] Assertion 'intack.vector >= pt_vector' failed at intr.c:367
(XEN) [ 1147.258249] ****************************************
(XEN) [ 1147.264366] 
(XEN) [ 1147.266585] Reboot in five seconds...

Thanks
Chao
From 715814743e59990068791ccd18b578e23877fc34 Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Mon, 9 Oct 2017 08:44:07 +0800
Subject: [PATCH] xtf: reproduce [xen-unstable test] 106504: regressions

Create a guest with 2 vCPU. vCPU0 sets up PIT to generate timer
interrupt every 1ms. It also boots up vCPU1. vCPU1 incessantly
masks/unmasks the corresponding IOAPIC RTE and sends IPI (vector 0x30)
to vCPU0.

Signed-off-by: Chao Gao <chao.gao@intel.com>
---
 tests/periodic-timer-2/Makefile     |  11 ++
 tests/periodic-timer-2/entry.S      |  15 ++
 tests/periodic-timer-2/extra.cfg.in |   1 +
 tests/periodic-timer-2/main.c       | 294 ++++++++++++++++++++++++++++++++++++
 4 files changed, 321 insertions(+)
 create mode 100644 tests/periodic-timer-2/Makefile
 create mode 100644 tests/periodic-timer-2/entry.S
 create mode 100644 tests/periodic-timer-2/extra.cfg.in
 create mode 100644 tests/periodic-timer-2/main.c

Comments

Andrew Cooper Oct. 9, 2017, 11:03 a.m. UTC | #1
On 09/10/17 08:58, Chao Gao wrote:
> On Mon, Oct 09, 2017 at 02:13:22PM +0800, Chao Gao wrote:
>> On Tue, Oct 03, 2017 at 11:08:01AM +0100, Roger Pau Monné wrote:
>>> On Tue, Oct 03, 2017 at 09:55:44AM +0000, osstest service owner wrote:
>>>> flight 113959 xen-unstable real [real]
>>>> http://logs.test-lab.xenproject.org/osstest/logs/113959/
>>>>
>>>> Regressions :-(
>>>>
>>>> Tests which did not succeed and are blocking,
>>>> including tests which could not be run:
>>>>  test-amd64-i386-libvirt-xsm  21 leak-check/check         fail REGR. vs. 113954
>>> This is due to cron running when the leak-check is executed.
>>>
>>>>  test-armhf-armhf-xl-multivcpu  5 host-ping-check-native  fail REGR. vs. 113954
>>>>  test-amd64-i386-xl-qemut-debianhvm-amd64 17 guest-stop   fail REGR. vs. 113954
>>> The test below has triggered the following ASSERT, CCing the Intel
>>> guys.
>>>
>>> Oct  3 06:12:00.415168 (XEN) d15v0: intack: 2:30 pt: 38
>>> Oct  3 06:12:19.191141 (XEN) vIRR: 00000000 00000000 00000000 00000000 00000000 00000000 00010000 00000000
>>> Oct  3 06:12:19.199162 (XEN)  PIR: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
>>> Oct  3 06:12:19.207160 (XEN) Assertion 'intack.vector >= pt_vector' failed at intr.c:367
>>> Oct  3 06:12:19.215215 (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Not tainted ]----
>>> Oct  3 06:12:19.223124 (XEN) CPU:    1
>>> Oct  3 06:12:19.223153 (XEN) RIP:    e008:[<ffff82d0803022a5>] vmx_intr_assist+0x617/0x637
>>> Oct  3 06:12:19.231185 (XEN) RFLAGS: 0000000000010292   CONTEXT: hypervisor (d15v0)
>>> Oct  3 06:12:19.239163 (XEN) rax: ffff83022dfc802c   rbx: ffff8300ccc65680   rcx: 0000000000000000
>>> Oct  3 06:12:19.247169 (XEN) rdx: ffff83022df7ffff   rsi: 000000000000000a   rdi: ffff82d0804606d8
>>> Oct  3 06:12:19.255127 (XEN) rbp: ffff83022df7ff08   rsp: ffff83022df7fea8   r8:  ffff83022df90000
>>> Oct  3 06:12:19.263114 (XEN) r9:  0000000000000001   r10: 0000000000000000   r11: 0000000000000001
>>> Oct  3 06:12:19.271109 (XEN) r12: 00000000ffffffff   r13: ffff82d0803cfba6   r14: ffff82d0803cfba6
>>> Oct  3 06:12:19.279119 (XEN) r15: 0000000000000004   cr0: 0000000080050033   cr4: 00000000001526e0
>>> Oct  3 06:12:19.279157 (XEN) cr3: 0000000214274000   cr2: 00005622a2184dbf
>>> Oct  3 06:12:19.287123 (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
>>> Oct  3 06:12:19.295105 (XEN) Xen code around <ffff82d0803022a5> (vmx_intr_assist+0x617/0x637):
>>> Oct  3 06:12:19.303150 (XEN)  41 bf 00 00 00 00 eb a0 <0f> 0b 89 ce 48 89 df e8 bb 20 00 00 e9 49 fe ff
>>> Oct  3 06:12:19.311112 (XEN) Xen stack trace from rsp=ffff83022df7fea8:
>>> Oct  3 06:12:19.311146 (XEN)    ffff83022df7ff08 000000388030cf76 ffff82d0805a7570 ffff82d08057ad80
>>> Oct  3 06:12:19.319131 (XEN)    ffff83022df7ffff ffff83022df7fee0 ffff82d08023b9b6 ffff8300ccc65000
>>> Oct  3 06:12:19.327115 (XEN)    000000000000000b 0000000000000020 00000000000000c2 0000000000000004
>>> Oct  3 06:12:19.345094 (XEN)    ffff880029eb4000 ffff82d080311c21 0000000000000004 00000000000000c2
>>> Oct  3 06:12:19.345177 (XEN)    0000000000000020 000000000000000b ffff880029eb4000 ffffffff81adf0a0
>>> Oct  3 06:12:19.351221 (XEN)    0000000000000000 0000000000000000 ffff88002d400008 0000000000000000
>>> Oct  3 06:12:19.359439 (XEN)    0000000000000030 0000000000000000 00000000000003f8 00000000000003f8
>>> Oct  3 06:12:19.367267 (XEN)    ffffffff81adf0a0 0000beef0000beef ffffffff8138a5f4 000000bf0000beef
>>> Oct  3 06:12:19.375222 (XEN)    0000000000000002 ffff88002f803e08 000000000000beef 000000000000beef
>>> Oct  3 06:12:19.383198 (XEN)    000000000000beef 000000000000beef 000000000000beef 0000000000000001
>>> Oct  3 06:12:19.391230 (XEN)    ffff8300ccc65000 00000031ada20d00 00000000001526e0
>>> Oct  3 06:12:19.399336 (XEN) Xen call trace:
>>> Oct  3 06:12:19.399389 (XEN)    [<ffff82d0803022a5>] vmx_intr_assist+0x617/0x637
>>> Oct  3 06:12:19.407337 (XEN)    [<ffff82d080311c21>] vmx_asm_vmexit_handler+0x41/0x120
>>> Oct  3 06:12:19.407380 (XEN) 
>>> Oct  3 06:12:19.415246 (XEN) 
>>> Oct  3 06:12:19.415278 (XEN) ****************************************
>>> Oct  3 06:12:19.415307 (XEN) Panic on CPU 1:
>>> Oct  3 06:12:19.415332 (XEN) Assertion 'intack.vector >= pt_vector' failed at intr.c:367
>>> Oct  3 06:12:19.423432 (XEN) ****************************************
>> (CC Jan)
>>
>> Hi, Roger.
>>
>> I sent a patch to fix a possible cause of this bug, seeing
>> https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg03254.html.
>>
>> Due to Xen 4.9 release, I put this patch aside and later forgot to
>> continue fixing this bug. Sorry for this. Of course, I will fix this
>> bug.
>>
>> I thought the root case was:
>> When injecting periodic timer interrupt in vmx_intr_assist(),
>> multi-read operations are done during one event delivery. For
>> example, if a periodic timer interrupt is from PIT, when set the
>> corresponding bit in vIRR, the corresponding RTE is accessed in
>> pt_update_irq(). When this function returns, it accesses the RTE
>> again to get the vector it sets in vIRR.  Between the two
>> accesses, the content of RTE may have been changed by another CPU
>> for no protection method in use. This case can incur the
>> assertion failure in vmx_intr_assist().
>>
>> For example, in this case, we may set 0x30 in vIRR, but return 0x38 to
>> vmx_intr_assist(). When we try to inject an interrupt, we would find
>> 0x38 is greater than the highest vector; then the assertion failure
>> happened. I have a xtf case to reproduce this bug, seeing
>> https://lists.xenproject.org/archives/html/xen-devel/2017-03/msg02906.html.
>> But according to Jan's opinion, he thought the bug was unlikely
>> triggered in OSSTEST by these weird operations.
>>
>> After thinking over it, the bug also can be caused by pt_update_irq()
>> returns 0x38 but it doesn't set 0x38 in vIRR for the corresponding RTE
>> is masked. Please refer to the code path:
>> vmx_intr_assist() -> pt_update_irq() -> hvm_isa_irq_assert() ->
>> assert_irq() -> assert_gsi() -> vioapic_irq_positive_edge().
>> Note that in vioapic_irq_positive_edge(), if ent->fields.mask is set,
>> the function returns without setting the corresponding bit in vIRR.
> To verify this guess, I modify the above xtf a little. The new xtf test
> (enclosed in attachment) Create a guest with 2 vCPU. vCPU0 sets up PIT
> to generate timer interrupt every 1ms. It also boots up vCPU1. vCPU1
> incessantly masks/unmasks the corresponding IOAPIC RTE and sends IPI
> (vector 0x30) to vCPU0. The bug happens as expected:

On the XTF side of things, I really need to get around to cleaning up my
SMP support work.  There are an increasing number of tests which are
creating ad-hoc APs.

Recently, an APIC driver has been introduced, so you can probably drop
1/3 of that code by using apic_init()/apic_icr_write().  I've also got a
proto IO-APIC driver which I should clean up and upstream.

~Andrew
Chao Gao Oct. 9, 2017, 11:18 a.m. UTC | #2
On Mon, Oct 09, 2017 at 12:03:53PM +0100, Andrew Cooper wrote:
>On 09/10/17 08:58, Chao Gao wrote:
>> On Mon, Oct 09, 2017 at 02:13:22PM +0800, Chao Gao wrote:
>>> On Tue, Oct 03, 2017 at 11:08:01AM +0100, Roger Pau Monné wrote:
>>>> On Tue, Oct 03, 2017 at 09:55:44AM +0000, osstest service owner wrote:
>>>>> flight 113959 xen-unstable real [real]
>>>>> http://logs.test-lab.xenproject.org/osstest/logs/113959/
>>>>>
>>>>> Regressions :-(
>>>>>
>>>>> Tests which did not succeed and are blocking,
>>>>> including tests which could not be run:
>>>>>  test-amd64-i386-libvirt-xsm  21 leak-check/check         fail REGR. vs. 113954
>>>> This is due to cron running when the leak-check is executed.
>>>>
>>>>>  test-armhf-armhf-xl-multivcpu  5 host-ping-check-native  fail REGR. vs. 113954
>>>>>  test-amd64-i386-xl-qemut-debianhvm-amd64 17 guest-stop   fail REGR. vs. 113954
>>>> The test below has triggered the following ASSERT, CCing the Intel
>>>> guys.
>>>>
>>>> Oct  3 06:12:00.415168 (XEN) d15v0: intack: 2:30 pt: 38
>>>> Oct  3 06:12:19.191141 (XEN) vIRR: 00000000 00000000 00000000 00000000 00000000 00000000 00010000 00000000
>>>> Oct  3 06:12:19.199162 (XEN)  PIR: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
>>>> Oct  3 06:12:19.207160 (XEN) Assertion 'intack.vector >= pt_vector' failed at intr.c:367
>>>> Oct  3 06:12:19.215215 (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Not tainted ]----
>>>> Oct  3 06:12:19.223124 (XEN) CPU:    1
>>>> Oct  3 06:12:19.223153 (XEN) RIP:    e008:[<ffff82d0803022a5>] vmx_intr_assist+0x617/0x637
>>>> Oct  3 06:12:19.231185 (XEN) RFLAGS: 0000000000010292   CONTEXT: hypervisor (d15v0)
>>>> Oct  3 06:12:19.239163 (XEN) rax: ffff83022dfc802c   rbx: ffff8300ccc65680   rcx: 0000000000000000
>>>> Oct  3 06:12:19.247169 (XEN) rdx: ffff83022df7ffff   rsi: 000000000000000a   rdi: ffff82d0804606d8
>>>> Oct  3 06:12:19.255127 (XEN) rbp: ffff83022df7ff08   rsp: ffff83022df7fea8   r8:  ffff83022df90000
>>>> Oct  3 06:12:19.263114 (XEN) r9:  0000000000000001   r10: 0000000000000000   r11: 0000000000000001
>>>> Oct  3 06:12:19.271109 (XEN) r12: 00000000ffffffff   r13: ffff82d0803cfba6   r14: ffff82d0803cfba6
>>>> Oct  3 06:12:19.279119 (XEN) r15: 0000000000000004   cr0: 0000000080050033   cr4: 00000000001526e0
>>>> Oct  3 06:12:19.279157 (XEN) cr3: 0000000214274000   cr2: 00005622a2184dbf
>>>> Oct  3 06:12:19.287123 (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
>>>> Oct  3 06:12:19.295105 (XEN) Xen code around <ffff82d0803022a5> (vmx_intr_assist+0x617/0x637):
>>>> Oct  3 06:12:19.303150 (XEN)  41 bf 00 00 00 00 eb a0 <0f> 0b 89 ce 48 89 df e8 bb 20 00 00 e9 49 fe ff
>>>> Oct  3 06:12:19.311112 (XEN) Xen stack trace from rsp=ffff83022df7fea8:
>>>> Oct  3 06:12:19.311146 (XEN)    ffff83022df7ff08 000000388030cf76 ffff82d0805a7570 ffff82d08057ad80
>>>> Oct  3 06:12:19.319131 (XEN)    ffff83022df7ffff ffff83022df7fee0 ffff82d08023b9b6 ffff8300ccc65000
>>>> Oct  3 06:12:19.327115 (XEN)    000000000000000b 0000000000000020 00000000000000c2 0000000000000004
>>>> Oct  3 06:12:19.345094 (XEN)    ffff880029eb4000 ffff82d080311c21 0000000000000004 00000000000000c2
>>>> Oct  3 06:12:19.345177 (XEN)    0000000000000020 000000000000000b ffff880029eb4000 ffffffff81adf0a0
>>>> Oct  3 06:12:19.351221 (XEN)    0000000000000000 0000000000000000 ffff88002d400008 0000000000000000
>>>> Oct  3 06:12:19.359439 (XEN)    0000000000000030 0000000000000000 00000000000003f8 00000000000003f8
>>>> Oct  3 06:12:19.367267 (XEN)    ffffffff81adf0a0 0000beef0000beef ffffffff8138a5f4 000000bf0000beef
>>>> Oct  3 06:12:19.375222 (XEN)    0000000000000002 ffff88002f803e08 000000000000beef 000000000000beef
>>>> Oct  3 06:12:19.383198 (XEN)    000000000000beef 000000000000beef 000000000000beef 0000000000000001
>>>> Oct  3 06:12:19.391230 (XEN)    ffff8300ccc65000 00000031ada20d00 00000000001526e0
>>>> Oct  3 06:12:19.399336 (XEN) Xen call trace:
>>>> Oct  3 06:12:19.399389 (XEN)    [<ffff82d0803022a5>] vmx_intr_assist+0x617/0x637
>>>> Oct  3 06:12:19.407337 (XEN)    [<ffff82d080311c21>] vmx_asm_vmexit_handler+0x41/0x120
>>>> Oct  3 06:12:19.407380 (XEN) 
>>>> Oct  3 06:12:19.415246 (XEN) 
>>>> Oct  3 06:12:19.415278 (XEN) ****************************************
>>>> Oct  3 06:12:19.415307 (XEN) Panic on CPU 1:
>>>> Oct  3 06:12:19.415332 (XEN) Assertion 'intack.vector >= pt_vector' failed at intr.c:367
>>>> Oct  3 06:12:19.423432 (XEN) ****************************************
>>> (CC Jan)
>>>
>>> Hi, Roger.
>>>
>>> I sent a patch to fix a possible cause of this bug, seeing
>>> https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg03254.html.
>>>
>>> Due to Xen 4.9 release, I put this patch aside and later forgot to
>>> continue fixing this bug. Sorry for this. Of course, I will fix this
>>> bug.
>>>
>>> I thought the root case was:
>>> When injecting periodic timer interrupt in vmx_intr_assist(),
>>> multi-read operations are done during one event delivery. For
>>> example, if a periodic timer interrupt is from PIT, when set the
>>> corresponding bit in vIRR, the corresponding RTE is accessed in
>>> pt_update_irq(). When this function returns, it accesses the RTE
>>> again to get the vector it sets in vIRR.  Between the two
>>> accesses, the content of RTE may have been changed by another CPU
>>> for no protection method in use. This case can incur the
>>> assertion failure in vmx_intr_assist().
>>>
>>> For example, in this case, we may set 0x30 in vIRR, but return 0x38 to
>>> vmx_intr_assist(). When we try to inject an interrupt, we would find
>>> 0x38 is greater than the highest vector; then the assertion failure
>>> happened. I have a xtf case to reproduce this bug, seeing
>>> https://lists.xenproject.org/archives/html/xen-devel/2017-03/msg02906.html.
>>> But according to Jan's opinion, he thought the bug was unlikely
>>> triggered in OSSTEST by these weird operations.
>>>
>>> After thinking over it, the bug also can be caused by pt_update_irq()
>>> returns 0x38 but it doesn't set 0x38 in vIRR for the corresponding RTE
>>> is masked. Please refer to the code path:
>>> vmx_intr_assist() -> pt_update_irq() -> hvm_isa_irq_assert() ->
>>> assert_irq() -> assert_gsi() -> vioapic_irq_positive_edge().
>>> Note that in vioapic_irq_positive_edge(), if ent->fields.mask is set,
>>> the function returns without setting the corresponding bit in vIRR.
>> To verify this guess, I modify the above xtf a little. The new xtf test
>> (enclosed in attachment) Create a guest with 2 vCPU. vCPU0 sets up PIT
>> to generate timer interrupt every 1ms. It also boots up vCPU1. vCPU1
>> incessantly masks/unmasks the corresponding IOAPIC RTE and sends IPI
>> (vector 0x30) to vCPU0. The bug happens as expected:
>
>On the XTF side of things, I really need to get around to cleaning up my
>SMP support work.  There are an increasing number of tests which are
>creating ad-hoc APs.
>
>Recently, an APIC driver has been introduced, so you can probably drop
>1/3 of that code by using apic_init()/apic_icr_write().  I've also got a
>proto IO-APIC driver which I should clean up and upstream.

Thanks for your information. I will try to clean up this test and send
it out for review.

Thanks
Chao
diff mbox

Patch

diff --git a/tests/periodic-timer-2/Makefile b/tests/periodic-timer-2/Makefile
new file mode 100644
index 0000000..bc03779
--- /dev/null
+++ b/tests/periodic-timer-2/Makefile
@@ -0,0 +1,11 @@ 
+include $(ROOT)/build/common.mk
+
+NAME      := periodic-timer-2
+CATEGORY  := special
+TEST-ENVS := hvm32
+
+TEST-EXTRA-CFG := extra.cfg.in
+
+obj-perenv += main.o entry.o
+
+include $(ROOT)/build/gen.mk
diff --git a/tests/periodic-timer-2/entry.S b/tests/periodic-timer-2/entry.S
new file mode 100644
index 0000000..8a32f76
--- /dev/null
+++ b/tests/periodic-timer-2/entry.S
@@ -0,0 +1,15 @@ 
+#include <arch/idt.h>
+#include <arch/page.h>
+#include <arch/processor.h>
+#include <arch/segment.h>
+#include <xtf/asm_macros.h>
+#include <arch/msr-index.h>
+
+    .align  16
+    .code32
+
+ENTRY(handle_external_int)
+    SAVE_ALL
+    call pt_interrupt_handler
+    RESTORE_ALL
+    iret
diff --git a/tests/periodic-timer-2/extra.cfg.in b/tests/periodic-timer-2/extra.cfg.in
new file mode 100644
index 0000000..8cfbab9
--- /dev/null
+++ b/tests/periodic-timer-2/extra.cfg.in
@@ -0,0 +1 @@ 
+vcpus=2
diff --git a/tests/periodic-timer-2/main.c b/tests/periodic-timer-2/main.c
new file mode 100644
index 0000000..3801329
--- /dev/null
+++ b/tests/periodic-timer-2/main.c
@@ -0,0 +1,294 @@ 
+/**
+ * @file tests/periodic-timer/main.c
+ * @ref test-periodic-timer
+ *
+ * @page test-periodic-timer periodic-timer
+ *
+ * @todo Docs for test-periodic-timer
+ *
+ * @see tests/periodic-timer/main.c
+ */
+#include <xtf.h>
+#include <arch/barrier.h>
+#include <arch/idt.h>
+#include <xtf/asm_macros.h>
+#include <arch/msr-index.h>
+
+#define COUNTER_FREQ    1193181
+#define MAX_PIT_HZ      COUNTER_FREQ
+#define MIN_PIT_HZ      18
+#define PIT_CTRL_PORT   0x43
+#define PIT_CHANNEL0    0x40
+
+#define IOAPIC_REGSEL   ((uint32_t *)0xfec00000)
+#define IOAPIC_IOWIN    ((uint32_t *)0xfec00010)
+
+#define AP_START_EIP 0x1000UL
+extern char ap_boot_start[], ap_boot_end[];
+
+asm (
+    "    .text                       \n"
+    "    .code16                     \n"
+    "ap_boot_start: .code16          \n"
+    "    mov   %cs,%ax               \n"
+    "    mov   %ax,%ds               \n"
+    "    lgdt  gdt_desr-ap_boot_start\n"
+    "    xor   %ax, %ax              \n"
+    "    inc   %ax                   \n"
+    "    lmsw  %ax                   \n"
+    "    ljmpl $0x08,$1f             \n"
+    "gdt_desr:                       \n"
+    "    .word gdt_end - gdt - 1     \n"
+    "    .long gdt                   \n"
+    "ap_boot_end: .code32            \n"
+    "1:  mov   $0x10,%eax            \n"
+    "    mov   %eax,%ds              \n"
+    "    mov   %eax,%es              \n"
+    "    mov   %eax,%ss              \n"
+    "    movl  $stack_top,%esp       \n"
+    "    movl  %esp,%ebp             \n"
+    "    call  test_ap_main          \n"
+    "1:  hlt                         \n"
+    "    jmp  1b                     \n"
+    "                                \n"
+    "    .align 8                    \n"
+    "gdt:                            \n"
+    "    .quad 0x0000000000000000    \n"
+    "    .quad 0x00cf9a000000ffff    \n" /* 0x08: Flat code segment */
+    "    .quad 0x00cf92000000ffff    \n" /* 0x10: Flat data segment */
+    "gdt_end:                        \n"
+    "                                \n"
+    "    .bss                        \n"
+    "    .align    8                 \n"
+    "stack:                          \n"
+    "    .skip    0x4000             \n"
+    "stack_top:                      \n"
+    "    .text                       \n"
+    );
+
+const char test_title[] = "Test periodic-timer-2";
+
+int init_pit(int freq)
+{
+    uint16_t reload;
+
+    if ( (freq < MIN_PIT_HZ) || (freq > MAX_PIT_HZ) ) 
+        return -1;
+
+    reload = COUNTER_FREQ / freq;
+
+    asm volatile("cli");
+    outb(0x34, PIT_CTRL_PORT);
+    outb(reload & 0xff, PIT_CHANNEL0);
+    outb(reload >> 8, PIT_CHANNEL0);
+    asm volatile("sti");
+    return 0;
+}
+
+struct ioapic_entry {
+    union {
+        struct {
+            uint32_t vector : 8,
+                     dlm    : 3,
+                     dm     : 1,
+                     dls    : 1,
+                     pol    : 1,
+                     irr    : 1,
+                     tri    : 1,
+                     mask   : 1,
+                     rsvd1  : 15;
+            uint32_t rsvd2  : 24,
+                     dest   : 8;
+        } fields;
+        struct {
+            uint32_t lo;
+            uint32_t hi;
+        } bits;
+    };
+} __attribute__ ((packed));
+
+void writel(uint32_t data, uint32_t *addr)
+{
+    *addr = data;
+}
+
+#define readl(data, addr) (data) = *(addr)
+
+int write_IOAPIC_entry(struct ioapic_entry *ent, int pin)
+{
+    asm volatile("cli");
+    writel(0x11 + 2*pin, IOAPIC_REGSEL); 
+    writel(ent->bits.hi, IOAPIC_IOWIN);
+    wmb();
+    writel(0x10 + 2*pin, IOAPIC_REGSEL); 
+    writel(ent->bits.lo, IOAPIC_IOWIN);
+    wmb();
+    asm volatile("sti");
+    return 0;
+}
+
+void handle_external_int(void);
+
+#define rdmsr(msr, val1, val2) \
+    __asm__ __volatile__("rdmsr" \
+            : "=a" (val1), "=d" (val2) \
+            : "c" (msr))
+
+#define wrmsr(msr, val1, val2) \
+    __asm__ __volatile__("wrmsr" \
+            : \
+            : "c" (msr), "a" (val1), "d" (val2))
+
+static inline void wrmsrl(unsigned int msr, uint64_t val)
+{
+    uint32_t lo, hi;
+    lo = (uint32_t)val;
+    hi = (uint32_t)(val >> 32);
+    wrmsr(msr, lo, hi);
+}
+
+#define APIC_BASE_ADDR_MASK 0xfffff000
+#define APIC_BASE_ADDR(a) (a & APIC_BASE_ADDR_MASK)
+#define APIC_BASE_MSR 0x1b
+#define APIC_GLOBAL_ENABLE_MASK 0x800
+#define APIC_EOI 0xB0
+#define APIC_SVR 0xF0
+#define APIC_SOFT_ENABLE_MASK 0x100
+
+uint32_t apic_base_addr;
+
+void enable_lapic(void)
+{
+    uint32_t lo, hi;
+    uint64_t apic_base_msr;
+    uint32_t svr;
+    rdmsr(APIC_BASE_MSR, lo, hi);
+    apic_base_msr = lo | ((uint64_t) hi <<32);
+    apic_base_addr = APIC_BASE_ADDR(apic_base_msr);
+    wrmsrl(APIC_BASE_MSR, apic_base_msr | APIC_GLOBAL_ENABLE_MASK);
+    readl(svr, (uint32_t *)(apic_base_addr + APIC_SVR)); 
+    writel(svr | APIC_SOFT_ENABLE_MASK, (uint32_t *)(apic_base_addr + APIC_SVR));
+}
+
+void ack_APIC_irq(unsigned long apic_base)
+{
+    writel(0, (uint32_t *)(apic_base + APIC_EOI));
+}
+
+uint32_t lapic_read(uint32_t reg)
+{
+    return *(volatile uint32_t *)(apic_base_addr + reg);
+}
+
+void lapic_write(uint32_t reg, uint32_t val)
+{
+    *(volatile uint32_t *)(apic_base_addr + reg) = val;
+}
+
+#define APIC_ICR 0x300
+#define APIC_ICR2 0x310
+#define APIC_ICR_BUSY 0x01000
+#define APIC_DM_INIT 0x500
+#define APIC_DM_STARTUP 0x600
+
+static inline void cpu_relax(void)
+{
+    asm volatile ( "rep; nop" ::: "memory" );
+}
+
+static void lapic_wait_ready(void)
+{
+    while ( lapic_read(APIC_ICR) & APIC_ICR_BUSY )
+        cpu_relax();
+}
+
+void pt_interrupt_handler(void)
+{
+    ack_APIC_irq(apic_base_addr);
+}
+
+static void boot_cpu(int cpu)
+{
+    unsigned int icr2 = (cpu * 2) << 24;
+    lapic_wait_ready();
+    lapic_write(APIC_ICR2, icr2);
+    lapic_write(APIC_ICR, APIC_DM_INIT);
+    lapic_wait_ready();
+    lapic_write(APIC_ICR2, icr2);
+    lapic_write(APIC_ICR, APIC_DM_STARTUP | (AP_START_EIP >> 12));
+    lapic_wait_ready();
+    lapic_write(APIC_ICR2, icr2);
+    lapic_write(APIC_ICR, APIC_DM_STARTUP | (AP_START_EIP >> 12));
+    lapic_wait_ready();
+}
+
+static void send_ipi(int cpu, uint8_t vector)
+{
+    unsigned int icr2 = (cpu * 2) << 24;
+    lapic_wait_ready();
+    lapic_write(APIC_ICR2, icr2);
+    lapic_write(APIC_ICR, vector);
+    lapic_wait_ready();
+}
+
+void smp_initialize(void)
+{
+    memcpy((void*)AP_START_EIP, ap_boot_start,
+           ap_boot_end - ap_boot_start); 
+    boot_cpu(1);
+}
+
+struct ioapic_entry ent;
+void test_main(void)
+{
+    struct xtf_idte idte = 
+    {
+        .addr = (unsigned long)handle_external_int,
+        .cs = __KERN_CS,
+        .dpl = 0,
+    };
+
+    /* setup idt entry */
+    xtf_set_idte(0x30, &idte);
+    xtf_set_idte(0x38, &idte);
+
+    //asm volatile(".byte 0xcd,0x30\n");
+
+    memset(&ent, 0, sizeof(ent));
+    ent.fields.vector = 0x38;
+    write_IOAPIC_entry(&ent, 2);
+    enable_lapic();
+
+    printk("activate cpu1\n");
+    smp_initialize();
+    init_pit(1000);
+
+    while (1)
+        cpu_relax();
+
+    xtf_success(NULL);
+}
+
+void test_ap_main(void)
+{
+    struct ioapic_entry ent2;
+    memcpy(&ent2, &ent, sizeof(ent2));
+    ent2.fields.vector = 0x38;
+    ent2.fields.mask = 1;
+    while (1)
+    {
+        write_IOAPIC_entry(&ent2, 2);
+        send_ipi(0, 0x30);
+        write_IOAPIC_entry(&ent, 2);
+    }
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */