diff mbox

Nested virtualization off VMware vSphere 6.0 with EL6 guests crashes on Xen 4.6

Message ID 20160204183647.GA7205@char.us.oracle.com
State New, archived
Headers show

Commit Message

Konrad Rzeszutek Wilk Feb. 4, 2016, 6:36 p.m. UTC
On Wed, Feb 03, 2016 at 10:07:27AM -0500, Konrad Rzeszutek Wilk wrote:
> On Wed, Feb 03, 2016 at 02:34:47AM -0700, Jan Beulich wrote:
> > >>> On 02.02.16 at 23:05, <konrad.wilk@oracle.com> wrote:
> > > This is getting more and more bizzare.
> > > 
> > > I realized that this machine has VMCS shadowing so Xen does not trap on
> > > any vmwrite or vmread. Unless I update the VMCS shadowing bitmap - which
> > > I did for vmwrite and vmread to get a better view of this. It never
> > > traps on VIRTUAL_APIC_PAGE_ADDR accesses. It does trap on: 
> > > VIRTUAL_PROCESSOR_ID,
> > > VM_EXIT_MSR_LOAD_ADDR and GUEST_[ES,DS,FS,GS,TR]_SELECTORS.
> > > 
> > > (It may also trap on IO_BITMAP_A,B but I didn't print that out).
> > > 
> > > To confirm that the VMCS that will be given to the L2 guest is correct
> > > I added some printking of some states that ought to be pretty OK such
> > > as HOST_RIP or HOST_RSP - which are all 0!
> > 
> > But did you also check what the field of interest starts out as?
> 
> I will do that.

Attached is the patch against staging (I had used 4.6 before as the only change
between those two was the dynamic mapping/unmapping of the vmread/vmwrite bitmap).

(d1) 
(d1) drive 0x000f6270: PCHS=16383/16/63 translation=lba LCHS=1024/255/63 s=524288000
(d1) 
(d1) Space available for UMB: cb800-ed000, f5d30-f6270
(d1) Returned 258048 bytes of ZoneHigh
(d1) e820 map has 7 items:
(d1)   0: 0000000000000000 - 000000000009fc00 = 1 RAM
(d1)   1: 000000000009fc00 - 00000000000a0000 = 2 RESERVED
(d1)   2: 00000000000f0000 - 0000000000100000 = 2 RESERVED
(d1)   3: 0000000000100000 - 00000000effff000 = 1 RAM
(d1)   4: 00000000effff000 - 00000000f0000000 = 2 RESERVED
(d1)   5: 00000000fc000000 - 0000000100000000 = 2 RESERVED
(d1)   6: 0000000100000000 - 000000020f800000 = 1 RAM
(d1) enter handle_19:
(d1)   NULL
(d1) Booting from Hard Disk...
(d1) Booting from 0000:7c00
(XEN) stdvga.c:178:d1v0 leaving stdvga mode
(XEN) stdvga.c:173:d1v0 entering stdvga mode
(XEN) nvmx_handle_vmwrite 1: IO_BITMAP_A(2000)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 0: IO_BITMAP_A(2000)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 1: IO_BITMAP_B(2002)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 2: IO_BITMAP_A(2000)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 1: VIRTUAL_APIC_PAGE_ADDR(2012)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 2: IO_BITMAP_B(2002)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 1: (2006)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 2: VIRTUAL_APIC_PAGE_ADDR(2012)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 1: VM_EXIT_MSR_LOAD_ADDR(2008)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 3: IO_BITMAP_A(2000)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 3: IO_BITMAP_B(2002)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 2: MSR_BITMAP(2004)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 1: MSR_BITMAP(2004)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 0: MSR_BITMAP(2004)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 3: (2006)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 3: VM_EXIT_MSR_LOAD_ADDR(2008)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 3: MSR_BITMAP(2004)[0=ffffffffffffffff]
(XEN) nvmx_handle_vmwrite 1: VIRTUAL_PROCESSOR_ID(0)[0=9]
(XEN) nvmx_handle_vmwrite 0: VIRTUAL_PROCESSOR_ID(0)[0=9]
(XEN) nvmx_handle_vmwrite 1: MSR_BITMAP(2004)[ffffffffffffffff=1367ed000]
(XEN) nvmx_handle_vmwrite 3: VIRTUAL_PROCESSOR_ID(0)[0=9]
(XEN) nvmx_handle_vmwrite 0: MSR_BITMAP(2004)[ffffffffffffffff=1367ed000]
(XEN) nvmx_handle_vmwrite 1: VM_EXIT_MSR_LOAD_ADDR(2008)[ffffffffffffffff=135639f40]
(XEN) nvmx_handle_vmwrite 0: VM_EXIT_MSR_LOAD_ADDR(2008)[ffffffffffffffff=135666f40]
(XEN) nvmx_handle_vmwrite 2: VIRTUAL_PROCESSOR_ID(0)[0=9]
(XEN) nvmx_handle_vmwrite 3: MSR_BITMAP(2004)[ffffffffffffffff=1367ed000]
(XEN) nvmx_handle_vmwrite 3: VM_EXIT_MSR_LOAD_ADDR(2008)[ffffffffffffffff=135693f40]
(XEN) nvmx_handle_vmwrite 2: MSR_BITMAP(2004)[ffffffffffffffff=1367ed000]
(XEN) nvmx_handle_vmwrite 2: VM_EXIT_MSR_LOAD_ADDR(2008)[ffffffffffffffff=135701f40]
(XEN) nvmx_handle_vmwrite 3: VM_EXIT_MSR_LOAD_ADDR(2008)[135639f40=13763cf40]
(XEN) nvmx_handle_vmwrite 1: VM_EXIT_MSR_LOAD_ADDR(2008)[135701f40=137a3cf40]
(XEN) nvmx_handle_vmwrite 0: VM_EXIT_MSR_LOAD_ADDR(2008)[135693f40=13783cf40]
(XEN) nvmx_handle_vmwrite 2: VM_EXIT_MSR_LOAD_ADDR(2008)[135666f40=137c3cf40]
(XEN) nvmx_handle_vmwrite 3: (800)[0=0]
(XEN) nvmx_handle_vmwrite 3: (804)[0=0]
(XEN) nvmx_handle_vmwrite 3: (806)[0=0]
(XEN) nvmx_handle_vmwrite 3: (80a)[0=0]
(XEN) nvmx_handle_vmwrite 3: (80e)[0=0]
(XEN) vvmx.c:2566:d1v3 Unknown nested vmexit reason 80000021.
(XEN) Failed vm entry (exit reason 0x80000021) caused by invalid guest state (4).
(XEN) ************* VMCS Area **************
(XEN) *** Guest State ***
(XEN) CR0: actual=0x0000000000000030, shadow=0x0000000000000000, gh_mask=ffffffffffffffff
(XEN) CR4: actual=0x0000000000002050, shadow=0x0000000000000000, gh_mask=ffffffffffffffff
(XEN) CR3 = 0x0000000080c06000
(XEN) RSP = 0x0000000000000000 (0x0000000000000000)  RIP = 0x0000000000000000 (0x0000000000000000)
(XEN) RFLAGS=0x00000002 (0x00000002)  DR7 = 0x0000000000000400
(XEN) Sysenter RSP=0000000000000000 CS:RIP=0000:0000000000000000
(XEN)        sel  attr  limit   base
(XEN)   CS: 0000 00000 00000000 0000000000000000
(XEN)   DS: 0000 00000 00000000 0000000000000000
(XEN)   SS: 0000 00000 00000000 0000000000000000
(XEN)   ES: 0000 00000 00000000 0000000000000000
(XEN)   FS: 0000 00000 00000000 0000000000000000
(XEN)   GS: 0000 00000 00000000 0000000000000000
(XEN) GDTR:            00000000 0000000000000000
(XEN) LDTR: 0000 00000 00000000 0000000000000000
(XEN) IDTR:            00000000 0000000000000000
(XEN)   TR: 0000 00000 00000000 0000000000000000
(XEN) EFER = 0x0000000000000800  PAT = 0x0000000000000000
(XEN) PreemptionTimer = 0x00000000  SM Base = 0x00000000
(XEN) DebugCtl = 0x0000000000000000  DebugExceptions = 0x0000000000000000
(XEN) Interruptibility = 00000000  ActivityState = 00000000
(XEN) VIRTUAL_APIC_PAGE_ADDR = 0x0000000000000000 TPR threshold = 0x0000000000000000
(XEN) APIC_ACCESS_ADDR = 0x0000000000000000
(XEN) *** Host State ***
(XEN) RIP = 0xffff82d0801f8f80 (vmx_asm_vmexit_handler)  RSP = 0xffff834007897f90
(XEN) CS=e008 SS=0000 DS=0000 ES=0000 FS=0000 GS=0000 TR=e040
(XEN) FSBase=0000000000000000 GSBase=0000000000000000 TRBase=ffff83400789eb80
(XEN) GDTBase=ffff83400788f000 IDTBase=ffff83400789b000
(XEN) CR0=0000000080050033 CR3=00000040007a0000 CR4=00000000001526e0
(XEN) Sysenter RSP=ffff834007897fc0 CS:RIP=e008:ffff82d08023eb30
(XEN) EFER = 0x0000000000000000  PAT = 0x0000050100070406
(XEN) *** Control State ***
(XEN) PinBased=0000003f CPUBased=b62065fa SecondaryExec=000054eb
(XEN) EntryControls=000011fb ExitControls=001fefff
(XEN) ExceptionBitmap=00060042 PFECmask=00000000 PFECmatch=00000000
(XEN) VMEntry: intr_info=00000000 errcode=00000000 ilen=00000000
(XEN) VMExit: intr_info=00000000 errcode=00000000 ilen=00000006
(XEN)         reason=80000021 qualification=0000000000000004
(XEN) IDTVectoring: info=00000000 errcode=00000000
(XEN) TSC Offset = 0xffef355833aa1cd5
(XEN) TPR Threshold = 0x00  PostedIntrVec = 0x00
(XEN) EPT pointer = 0x00000040007d101e  EPTP index = 0x0000
(XEN) PLE Gap=00000080 Window=00001000
(XEN) Virtual processor ID = 0x0050 VMfunc controls = 0000000000000000
(XEN) **************************************
(XEN) domain_crash called from vmx.c:2845
(XEN) Domain 1 (vcpu#3) crashed on cpu#54:
(XEN) ----[ Xen-4.7-unstable  x86_64  debug=y  Tainted:    C ]----
(XEN) CPU:    54
(XEN) RIP:    0000:[<0000000000000000>]
(XEN) RFLAGS: 0000000000000002   CONTEXT: hvm guest (d1v3)
(XEN) rax: 0000000000000000   rbx: 0000000000000000   rcx: 0000000000000000
(XEN) rdx: 00000000078bfbff   rsi: 0000000000000000   rdi: 0000000000000000
(XEN) rbp: 0000000000000000   rsp: 0000000000000000   r8:  0000000000000000
(XEN) r9:  0000000000000000   r10: 0000000000000000   r11: 0000000000000000
(XEN) r12: 0000000000000000   r13: 0000000000000000   r14: 0000000000000000
(XEN) r15: 0000000000000000   cr0: 0000000000000010   cr4: 0000000000000000
(XEN) cr3: 0000000080c06000   cr2: 0000000000000000
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: 0000

I am going to augment more of the tracing to get an idea of what is happening
before this.

Comments

Jan Beulich Feb. 5, 2016, 10:33 a.m. UTC | #1
>>> On 04.02.16 at 19:36, <konrad.wilk@oracle.com> wrote:
> (XEN) nvmx_handle_vmwrite 1: IO_BITMAP_A(2000)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 0: IO_BITMAP_A(2000)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 1: IO_BITMAP_B(2002)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 2: IO_BITMAP_A(2000)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 1: VIRTUAL_APIC_PAGE_ADDR(2012)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 2: IO_BITMAP_B(2002)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 1: (2006)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 2: VIRTUAL_APIC_PAGE_ADDR(2012)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 1: VM_EXIT_MSR_LOAD_ADDR(2008)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 3: IO_BITMAP_A(2000)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 3: IO_BITMAP_B(2002)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 2: MSR_BITMAP(2004)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 1: MSR_BITMAP(2004)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 0: MSR_BITMAP(2004)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 3: (2006)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 3: VM_EXIT_MSR_LOAD_ADDR(2008)[0=ffffffffffffffff]
> (XEN) nvmx_handle_vmwrite 3: MSR_BITMAP(2004)[0=ffffffffffffffff]

So there's a whole lot of "interesting" writes of all ones, and indeed
VIRTUAL_APIC_PAGE_ADDR is among them, and the code doesn't
handle that case (nor the equivalent for APIC_ACCESS_ADDR).
What's odd though is that the writes are for vCPU 1 and 2, while
the crash is on vCPU 3 (it would of course help if the guest had as
few vCPU-s as possible without making the issue disappear). While
you have circumvented the ASSERT() you've originally hit, the log
messages you've added there don't appear anywhere, which is
clearly confusing, so I wonder what other unintended effects your
debugging code has (there's clearly an uninitialized variable issue
in your additions to vmx_vmexit_handler(), but that shouldn't
matter here, albeit it should have cause build failure, making me
suspect the patch to be stale).

Oddly enough the various bitmap field VMWRITEs above should all
fail, yet the guest appears to recover from (ignore?) these
failures. (From all I can tell we're prone to NULL dereferences due
to that at least in _shadow_io_bitmap().)

> (XEN) Failed vm entry (exit reason 0x80000021) caused by invalid guest state (4).

4 means invalid VMCS link pointer - interesting.

Jan
Konrad Rzeszutek Wilk Nov. 3, 2016, 1:41 a.m. UTC | #2
On Fri, Feb 05, 2016 at 03:33:44AM -0700, Jan Beulich wrote:
> >>> On 04.02.16 at 19:36, <konrad.wilk@oracle.com> wrote:
> > (XEN) nvmx_handle_vmwrite 1: IO_BITMAP_A(2000)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 0: IO_BITMAP_A(2000)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 1: IO_BITMAP_B(2002)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 2: IO_BITMAP_A(2000)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 1: VIRTUAL_APIC_PAGE_ADDR(2012)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 2: IO_BITMAP_B(2002)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 1: (2006)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 2: VIRTUAL_APIC_PAGE_ADDR(2012)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 1: VM_EXIT_MSR_LOAD_ADDR(2008)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 3: IO_BITMAP_A(2000)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 3: IO_BITMAP_B(2002)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 2: MSR_BITMAP(2004)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 1: MSR_BITMAP(2004)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 0: MSR_BITMAP(2004)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 3: (2006)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 3: VM_EXIT_MSR_LOAD_ADDR(2008)[0=ffffffffffffffff]
> > (XEN) nvmx_handle_vmwrite 3: MSR_BITMAP(2004)[0=ffffffffffffffff]
> 
> So there's a whole lot of "interesting" writes of all ones, and indeed
> VIRTUAL_APIC_PAGE_ADDR is among them, and the code doesn't
> handle that case (nor the equivalent for APIC_ACCESS_ADDR).
> What's odd though is that the writes are for vCPU 1 and 2, while
> the crash is on vCPU 3 (it would of course help if the guest had as
> few vCPU-s as possible without making the issue disappear). While
> you have circumvented the ASSERT() you've originally hit, the log
> messages you've added there don't appear anywhere, which is
> clearly confusing, so I wonder what other unintended effects your
> debugging code has (there's clearly an uninitialized variable issue
> in your additions to vmx_vmexit_handler(), but that shouldn't
> matter here, albeit it should have cause build failure, making me
> suspect the patch to be stale).
> 
> Oddly enough the various bitmap field VMWRITEs above should all
> fail, yet the guest appears to recover from (ignore?) these
> failures. (From all I can tell we're prone to NULL dereferences due
> to that at least in _shadow_io_bitmap().)
> 
> > (XEN) Failed vm entry (exit reason 0x80000021) caused by invalid guest state (4).
> 
> 4 means invalid VMCS link pointer - interesting.
>

Hey Jan,

I hadn't been able to look at this for a quite while. A couple of folks have
showed interest in looking at this, CC-ing them.

For folks that are new, it may also be worth looking at:
http://www.gossamer-threads.com/lists/xen/devel/413285?page=last
which has the full thread.

Here are also the instructions on how to reproduce it:
(This Xen 4.7 'staging-4.7')

2) Download VMWare ESX and install it:
[root@localhost ~]# more vmware.xm
memory=8192
maxvcpus = 4
name = "VMWARE"
vif = [ 'mac=00:0f:4b:00:00:85,bridge=switch,model=e1000' ]
disk= ['phy:/dev/nested_guests/VMWare_ESX,hda,w']
#,'file:/mnt/iso/VMware-VMvisor-Installer-6.0.0.update02-3620759.x86_64.iso,hdc:cdrom,r']
#boot="dn"
kernel = "/usr/lib/xen/boot/hvmloader"
builder='hvm'
serial='pty'
vcpus = 4
vnc=1
vnclisten="0.0.0.0"
usb=1
nestedhvm=1

3) Let the guest be installed - once it has rebooted.
4) Enable SSH on the VMWare ESX,  Press F2 on guest console, login, select
'Troubleshooting Options', Enter 'Enable ESXi Shell' and 'Enable SSH'
5). Create a guest using VMWare ESXi Client (you need to use Windows for
that). I picked the simplest option and went ahead with FreeBSD (you can also do Linux).
6) To download the guest in VMWare you can SSH in the ESXi:
#cd /vmfs/volumes/datastore1
#wget http://ftp.freebsd.org/pub/FreeBSD/releases/ISO-IMAGES/11.0/FreeBSD-11.0-RELEASE-amd64-disc1.iso

7) In the VMWare ESXI client hook up the 'CD' to the ISO.

8). Hit Start and get greeted with:  You are running VMware ESX through an
incompatible hypervisor. You cannot power on a virtual machine until this
hypervisor is disabled". Go to
https://kb.vmware.com/selfservice/microsites/search.do?language=en_US&cmd=displayKC&externalId=2108724

which is just editing the .vmx file with an attribute, so login back in the
VMWare ESXi and:
[root@g-osstest:~] vi `find / -name *.vmx`

and add:
vmx.allowNested="TRUE"      

9) Start the guest up again in VMWare and be greeted with that splash screen. 

 (XEN) ----[ Xen-4.7.1-pre  x86_64  debug=n  Not tainted ]----
(XEN) CPU:    2
(XEN) RIP:    e008:[<ffff82d08027cd71>] put_page+0x1/0xd0
(XEN) RFLAGS: 0000000000010202   CONTEXT: hypervisor (d1v0)
(XEN) rax: 0000000000002012   rbx: ffff84802eddb000   rcx: 557f000000000000
(XEN) rdx: 555557f000000000   rsi: 000ffffff8000000   rdi: 0000000000000000
(XEN) rbp: 0000000000000000   rsp: ffff8488bf06fe98   r8:  0000000000000000
(XEN) r9:  0000000000000000   r10: 0000000000000014   r11: 0000000000000000
(XEN) r12: ffff8488bf06ff18   r13: 0000000000000d00   r14: 0000000000000000
(XEN) r15: ffff8488464c4000   cr0: 0000000080050033   cr4: 00000000003526e0
(XEN) cr3: 00000008464ab000   cr2: 0000000000000010
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
(XEN) Xen code around <ffff82d08027cd71> (put_page+0x1/0xd0):
(XEN)  ff 66 0f 1f 44 00 00 53 <48> 8b 57 10 48 8d 77 10 48 89 fb 48 8d 4a ff 48
(XEN) Xen stack trace from rsp=ffff8488bf06fe98:
(XEN)    ffff84802eddb000 ffff82d0802faba1 ffff8488bf06ff18 ffff82d0802f4af9
(XEN)    ffff8488bf06ff18 0000000000000000 ffff82d080613200 00000004802fb799
(XEN)    ffff8488bf06ff18 ffff84802eddb000 0000000000000000 0000000000000000
(XEN)    0000000000000000 0000000000000000 0000000000000000 ffff82d0802fc186
(XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
(XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
(XEN)    0000000000000000 0000000000000000 0000000000000000 0000000000000000
(XEN)    00000000078bfbff 0000000000000000 0000000000000000 000200fa00000000
(XEN)    fffffffffc4b3420 0000000000000000 0000000000040046 fffffffffc607f00
(XEN)    0000000000000000 f9034c0b7e7cdfdf b113252c5e7bdbbf 31379d5bde78bbdd
(XEN)    7043c568fe7a51b5 384b956900000002 ffff84802eddb000 000001b83ea37880
(XEN)    00000000003526e0
(XEN) Xen call trace:
(XEN)    [<ffff82d08027cd71>] put_page+0x1/0xd0
(XEN)    [<ffff82d0802faba1>] vvmx.c#virtual_vmentry+0x261/0xdf0
(XEN)    [<ffff82d0802f4af9>] vmx_vmenter_helper+0xd9/0x260
(XEN)    [<ffff82d0802fc186>] vmx_asm_vmexit_handler+0x46/0x110
(XEN) 
(XEN) Pagetable walk from 0000000000000010:
(XEN)  L4[0x000] = 0000000000000000 ffffffffffffffff
(XEN) 
(XEN) ****************************************
(XEN) Panic on CPU 2:
(XEN) FATAL PAGE FAULT
(XEN) [error_code=0000]
(XEN) Faulting linear address: 0000000000000010
(XEN) ****************************************
(XEN) 
(XEN) Reboot in five seconds...
Konrad Rzeszutek Wilk Nov. 3, 2016, 2:36 p.m. UTC | #3
.. snip..
>>
>> > (XEN) Failed vm entry (exit reason 0x80000021) caused by invalid guest state (4).
>>
>> 4 means invalid VMCS link pointer - interesting.
>>
>
> Hey Jan,
>
> I hadn't been able to look at this for a quite while. A couple of folks have
> showed interest in looking at this, CC-ing them.
>

and Matt (CC-ed) had been able to debug this a bit further as well:

" I tracked this down to incorrect Xen emulation of  VMWRITE, VMPTRLD,
VMLAUNCH, and VMRESUME, in which Xen is failing the
operation if the provided address can't be mapped. A L1 VMM should be
allowed to write whatever garbage it wants into VMCS. The value may
not even be used depending on other control fields.

Xen also shouldn't be setting RFLAGS.CF (VMfailInvalid) for any
condition other than an invalid VMCS-link pointer (it was setting
RFLAGS.CF when it couldn't map the bitmap pages)."

And in retrospective it makes sense that VMWare writes garbage in
VMCS - it is probably using the binary translation part at that point.
diff mbox

Patch

diff --git a/tools/xentrace/formats b/tools/xentrace/formats
index 5d7b72a..d44f2fa 100644
--- a/tools/xentrace/formats
+++ b/tools/xentrace/formats
@@ -42,8 +42,8 @@ 
 0x00081401  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  nVMENTRY
 0x00081402  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  nVMEXIT     [ exitcode = 0x%(1)08x, rIP  = 0x%(2)08x ]
 0x00081502  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  nVMEXIT     [ exitcode = 0x%(1)08x, rIP  = 0x%(3)08x%(2)08x ]
-0x00082001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ errorcode = 0x%(2)02x, virt = 0x%(1)08x ]
-0x00082101  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ errorcode = 0x%(3)02x, virt = 0x%(2)08x%(1)08x ]
+0x00082401  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  nL1         [ errorcode = 0x%(2)02x, virt = 0x%(1)08x ]
+0x00082501  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  nL1         [ errorcode = 0x%(3)02x, virt = 0x%(2)08x%(1)08x ]
 0x00082002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_INJECT   [ errorcode = 0x%(1)02x, virt = 0x%(2)08x ]
 0x00082102  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_INJECT   [ errorcode = 0x%(1)02x, virt = 0x%(3)08x%(2)08x ]
 0x00082003  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INJ_EXC     [ vector = 0x%(1)02x, errorcode = 0x%(2)04x ]
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 5bc3c74..964ef01 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1825,6 +1825,12 @@  void vmcs_dump_vcpu(struct vcpu *v)
          SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY )
         printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS));
 
+    if ( cpu_has_vmx_tpr_shadow )
+        printk("VIRTUAL_APIC_PAGE_ADDR = 0x%016lx TPR threshold = 0x%016lx\n",
+               vmr(VIRTUAL_APIC_PAGE_ADDR), vmr(TPR_THRESHOLD));
+    if ( cpu_has_vmx_virtualize_apic_accesses )
+        printk("APIC_ACCESS_ADDR = 0x%016lx\n",
+               vmr(APIC_ACCESS_ADDR));
     printk("*** Host State ***\n");
     printk("RIP = 0x%016lx (%ps)  RSP = 0x%016lx\n",
            vmr(HOST_RIP), (void *)vmr(HOST_RIP), vmr(HOST_RSP));
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 04dde83..565af29 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2970,6 +2970,7 @@  void vmx_vmexit_handler(struct cpu_user_regs *regs)
     unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0;
     unsigned int vector = 0;
     struct vcpu *v = current;
+    bool_t vcpu_guestmode;
 
     __vmread(GUEST_RIP,    &regs->rip);
     __vmread(GUEST_RSP,    &regs->rsp);
@@ -2986,12 +2987,17 @@  void vmx_vmexit_handler(struct cpu_user_regs *regs)
 
     __vmread(VM_EXIT_REASON, &exit_reason);
 
+    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
+        vcpu_guestmode = 1;
+
     if ( hvm_long_mode_enabled(v) )
-        HVMTRACE_ND(VMEXIT64, 0, 1/*cycles*/, 3, exit_reason,
+        HVMTRACE_ND(VMEXIT64, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0,
+                    1/*cycles*/, 3, exit_reason,
                     (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
                     0, 0, 0);
     else
-        HVMTRACE_ND(VMEXIT, 0, 1/*cycles*/, 2, exit_reason,
+        HVMTRACE_ND(VMEXIT, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0,
+                    1/*cycles*/, 2, exit_reason,
                     (uint32_t)regs->eip, 
                     0, 0, 0, 0);
 
@@ -3069,7 +3075,19 @@  void vmx_vmexit_handler(struct cpu_user_regs *regs)
     {
         paging_update_nestedmode(v);
         if ( nvmx_n2_vmexit_handler(regs, exit_reason) )
+        {
+            if ( hvm_long_mode_enabled(v) )
+                HVMTRACE_ND(PF_XEN64, TRC_HVM_NESTEDFLAG ,
+                    1/*cycles*/, 3, exit_reason,
+                    (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
+                    0, 0, 0);
+            else
+                HVMTRACE_ND(PF_XEN, TRC_HVM_NESTEDFLAG,
+                    1/*cycles*/, 2, exit_reason,
+                    (uint32_t)regs->eip, 
+                    0, 0, 0, 0);
             goto out;
+        }
     }
 
     if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index 271ec70..974e89f 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -101,7 +101,14 @@  int nvmx_vcpu_initialise(struct vcpu *v)
         set_bit(IO_BITMAP_B, vw);
         set_bit(VMCS_HIGH(IO_BITMAP_B), vw);
 
+        set_bit(VIRTUAL_APIC_PAGE_ADDR, vw);
+
+        unmap_domain_page(vw);
+
+        vw = __map_domain_page(vmread_bitmap);
+        set_bit(VIRTUAL_APIC_PAGE_ADDR, vw);
         unmap_domain_page(vw);
+
     }
 
     nvmx->ept.enabled = 0;
@@ -692,12 +699,40 @@  static void nvmx_update_virtual_apic_address(struct vcpu *v)
         p2m_type_t p2mt;
         unsigned long vapic_gpfn;
         struct page_info *vapic_pg;
+        u32 sec_ctrl = __n2_secondary_exec_control(v);
+
+        vapic_gpfn = __get_vvmcs(nvcpu->nv_vvmcx, VIRTUAL_APIC_PAGE_ADDR);
 
-        vapic_gpfn = __get_vvmcs(nvcpu->nv_vvmcx, VIRTUAL_APIC_PAGE_ADDR) >> PAGE_SHIFT;
-        vapic_pg = get_page_from_gfn(v->domain, vapic_gpfn, &p2mt, P2M_ALLOC);
-        ASSERT(vapic_pg && !p2m_is_paging(p2mt));
-        __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vapic_pg));
-        put_page(vapic_pg);
+        vapic_pg = get_page_from_gfn(v->domain, vapic_gpfn >> PAGE_SHIFT, &p2mt, P2M_ALLOC);
+        if ( !vapic_pg )
+        {
+            unsigned int i;
+            printk("%s: vCPU%d 0x%lx(vAPIC) 0x%lx(APIC), 0x%lx(TPR) ctrl=%x sec=%x %s%s%s%s\n",
+                   __func__,v->vcpu_id,
+                    vapic_gpfn,
+                    __get_vvmcs(nvcpu->nv_vvmcx, APIC_ACCESS_ADDR),
+                    __get_vvmcs(nvcpu->nv_vvmcx, TPR_THRESHOLD),
+                    ctrl, sec_ctrl,
+                    sec_ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES ? "APIC" : "",
+                    sec_ctrl & SECONDARY_EXEC_APIC_REGISTER_VIRT ? "APIC virt" : "",
+                    sec_ctrl & SECONDARY_EXEC_ENABLE_VMCS_SHADOWING ? "VMCS shadow" : "",
+                    sec_ctrl & SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS ? "v excpt" : "");
+            for (i = 0; i < 3; i++)
+                printk("%s: %s = 0x%lx updated %ld.\n", __func__,
+                       i == 0 ? "TPR threshold" :
+                        (i == 1 ? "Virtual APIC" :
+                            (i == 2 ? "APIC address" : "" )),
+                        nvcpu->debug[i], nvcpu->debug_cnt[i]);
+
+            printk("HOST_RIP=0x%lx HOST_RSP=0x%lx\n", __get_vvmcs(nvcpu->nv_vvmcx, HOST_RIP),
+                    __get_vvmcs(nvcpu->nv_vvmcx, HOST_RSP));
+            __vmwrite(VIRTUAL_APIC_PAGE_ADDR, vapic_gpfn);
+        } else {
+            ASSERT(vapic_pg);
+            ASSERT(vapic_pg && !p2m_is_paging(p2mt));
+            __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vapic_pg));
+            put_page(vapic_pg);
+        }
     }
     else
         __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0);
@@ -1729,12 +1764,17 @@  int nvmx_handle_vmread(struct cpu_user_regs *regs)
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
     u64 value = 0;
     int rc;
+    unsigned long operand;
 
     rc = decode_vmx_inst(regs, &decode, NULL, 0);
     if ( rc != X86EMUL_OKAY )
         return rc;
 
-    value = __get_vvmcs(nvcpu->nv_vvmcx, reg_read(regs, decode.reg2));
+    operand = reg_read(regs, decode.reg2);
+    value = __get_vvmcs(nvcpu->nv_vvmcx, operand);
+
+    if ( operand == VIRTUAL_APIC_PAGE_ADDR )
+        printk("%s: val=%lx\n", __func__, value);
 
     switch ( decode.type ) {
     case VMX_INST_MEMREG_TYPE_MEMORY:
@@ -1756,29 +1796,62 @@  int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
     struct vcpu *v = current;
     struct vmx_inst_decoded decode;
     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
-    unsigned long operand; 
+    unsigned long operand, val;
     u64 vmcs_encoding;
     bool_t okay = 1;
+    char *msg = NULL;
 
     if ( decode_vmx_inst(regs, &decode, &operand, 0)
              != X86EMUL_OKAY )
         return X86EMUL_EXCEPTION;
 
     vmcs_encoding = reg_read(regs, decode.reg2);
+    val = __get_vvmcs(nvcpu->nv_vvmcx, vmcs_encoding);
     __set_vvmcs(nvcpu->nv_vvmcx, vmcs_encoding, operand);
 
     switch ( vmcs_encoding & ~VMCS_HIGH(0) )
     {
     case IO_BITMAP_A:
         okay = _map_io_bitmap(v, IO_BITMAP_A);
+        msg="IO_BITMAP_A";
         break;
     case IO_BITMAP_B:
         okay = _map_io_bitmap(v, IO_BITMAP_B);
+        msg="IO_BITMAP_B";
         break;
     case MSR_BITMAP:
+        msg="MSR_BITMAP";
         okay = _map_msr_bitmap(v);
         break;
+    case TPR_THRESHOLD:
+        msg="TPR_THRESHOLD";
+        nvcpu->debug[0] = operand;
+        nvcpu->debug_cnt[0] ++;
+        okay = 1;
+        break;
+    case VIRTUAL_APIC_PAGE_ADDR:
+        msg="VIRTUAL_APIC_PAGE_ADDR";
+        nvcpu->debug[1] = operand;
+        nvcpu->debug_cnt[1] ++;
+        okay = 1;
+        break;
+    case APIC_ACCESS_ADDR:
+        msg="APIC_ACCESS_ADDR";
+        nvcpu->debug[2] = operand;
+        nvcpu->debug_cnt[1] ++;
+        okay = 1;
+        break;
+    case VM_EXIT_MSR_LOAD_ADDR:
+        msg="VM_EXIT_MSR_LOAD_ADDR";
+        break;
+    case VIRTUAL_PROCESSOR_ID:
+        msg="VIRTUAL_PROCESSOR_ID";
+    default:
+        okay = 1;
+        break;
     }
+    printk("%s %d: %s(%lx)[%lx=%lx]\n", __func__, v->vcpu_id,
+           msg ? msg : "",vmcs_encoding, val, operand);
 
     vmreturn(regs, okay ? VMSUCCEED : VMFAIL_VALID);
 
diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h
index 152d9f3..b69a584 100644
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -130,6 +130,9 @@  struct nestedvcpu {
 
     /* L2's control-resgister, just as the L2 sees them. */
     unsigned long       guest_cr[5];
+
+    unsigned long debug[3]; /* 0 - TPR, 1 - VIRTUAL APIC, 2 - APIC_ADDR */
+    unsigned long debug_cnt[3]; /* 0 - TPR, 1 - VIRTUAL APIC, 2 - APIC_ADDR */
 };
 
 #define vcpu_nestedhvm(v) ((v)->arch.hvm_vcpu.nvcpu)