
[7/8] kvm: nVMX: Introduce KVM_CAP_VMX_STATE

Message ID 1480536229-11754-8-git-send-email-jmattson@google.com (mailing list archive)

Commit Message

Jim Mattson Nov. 30, 2016, 8:03 p.m. UTC
With this capability, there are two new vcpu ioctls:
KVM_GET_VMX_STATE and KVM_SET_VMX_STATE. These can be used
for saving and restoring a VM that is in VMX operation.

Signed-off-by: Jim Mattson <jmattson@google.com>
---
 Documentation/virtual/kvm/api.txt |  44 ++++++++++++
 arch/x86/include/asm/kvm_host.h   |   5 ++
 arch/x86/include/uapi/asm/kvm.h   |  12 ++++
 arch/x86/kvm/vmx.c                | 138 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c                |  19 ++++++
 include/uapi/linux/kvm.h          |   4 ++
 6 files changed, 222 insertions(+)
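
As a rough illustration of the intended usage (a sketch only: the capability
and ioctl names come from this patch, but the buffer sizing, the data_size
semantics and the error handling here are my assumptions, not part of the
patch):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

struct vmx_state_buf {
        struct kvm_vmx_state hdr;
        __u8 data[4096];                /* room for the cached VMCS12 */
};

static int migrate_vmx_state(int kvm_fd, int src_vcpu_fd, int dst_vcpu_fd)
{
        struct vmx_state_buf buf;

        if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VMX_STATE) <= 0)
                return -1;              /* capability not available */

        memset(&buf, 0, sizeof(buf));
        buf.hdr.data_size = sizeof(buf.data);

        /* Pull VMXON pointer, current VMCS pointer, flags and the
         * cached VMCS12 out of the source vCPU. */
        if (ioctl(src_vcpu_fd, KVM_GET_VMX_STATE, &buf) < 0)
                return -1;

        /* ... transfer buf to the destination host ... */

        /* Restore the same state into the destination vCPU before its
         * first KVM_RUN. */
        return ioctl(dst_vcpu_fd, KVM_SET_VMX_STATE, &buf);
}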

Comments

Paolo Bonzini Dec. 9, 2016, 3:26 p.m. UTC | #1
On 30/11/2016 21:03, Jim Mattson wrote:
> +#define KVM_VMX_STATE_GUEST_MODE	0x00000001
> +#define KVM_VMX_STATE_RUN_PENDING	0x00000002
> +
> +/* for KVM_CAP_VMX_STATE */
> +struct kvm_vmx_state {
> +	__u64 vmxon_ptr;
> +	__u64 current_vmptr;
> +	__u32 flags;
> +	__u32 data_size;
> +	__u8 data[0];
> +};

Let's prepare the API for nested SVM too.  Please rename it to
KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:

	/* In addition to guest mode and run pending, please
	 * add one for GIF.
	 */
	__u16 flags;
	/* 0 for VMX, 1 for SVM.  */
	__u16 format;
	/* 128 for SVM, 128 + VMCS size for VMX.  */
	__u32 size;
	/* Both structs padded to 120 bytes.  */
	union {
		/* VMXON, VMCS */
		struct kvm_vmx_state vmx;
		/* HSAVE_PA, VMCB */
		struct kvm_svm_state svm;
	}
	__u8 data[0];

David, would the above make sense for s390 nested SIE too?

Thanks,

Paolo
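
For reference, one way the full layout could be spelled out (field names,
flag values and padding sizes below are guesses; only the
flags/format/size/union/data[] arrangement follows the sketch above):

#define KVM_NESTED_STATE_GUEST_MODE     0x0001
#define KVM_NESTED_STATE_RUN_PENDING    0x0002
#define KVM_NESTED_STATE_GIF            0x0004

#define KVM_NESTED_STATE_FORMAT_VMX     0
#define KVM_NESTED_STATE_FORMAT_SVM     1

struct kvm_vmx_nested_state {
        __u64 vmxon_pa;         /* VMXON region */
        __u64 current_vmptr;    /* current VMCS */
        __u8  pad[104];         /* pad to 120 bytes */
};

struct kvm_svm_nested_state {
        __u64 hsave_pa;         /* host save area */
        __u64 vmcb_pa;          /* current VMCB */
        __u8  pad[104];         /* pad to 120 bytes */
};

struct kvm_nested_state {
        __u16 flags;            /* guest mode, run pending, GIF */
        __u16 format;           /* 0 for VMX, 1 for SVM */
        __u32 size;             /* 128 for SVM, 128 + VMCS size for VMX */
        union {
                struct kvm_vmx_nested_state vmx;
                struct kvm_svm_nested_state svm;
        };
        __u8 data[0];           /* cached VMCS12 for VMX */
};
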
David Hildenbrand Dec. 9, 2016, 3:55 p.m. UTC | #2
Am 09.12.2016 um 16:26 schrieb Paolo Bonzini:
>
>
> On 30/11/2016 21:03, Jim Mattson wrote:
>> +#define KVM_VMX_STATE_GUEST_MODE	0x00000001
>> +#define KVM_VMX_STATE_RUN_PENDING	0x00000002
>> +
>> +/* for KVM_CAP_VMX_STATE */
>> +struct kvm_vmx_state {
>> +	__u64 vmxon_ptr;
>> +	__u64 current_vmptr;
>> +	__u32 flags;
>> +	__u32 data_size;
>> +	__u8 data[0];
>> +};
>
> Let's prepare the API for nested SVM too.  Please rename it to
> KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:
>
> 	/* In addition to guest mode and run pending, please
> 	 * add one for GIF.
> 	 */
> 	__u16 flags;
> 	/* 0 for VMX, 1 for SVM.  */
> 	__u16 format;
> 	/* 128 for SVM, 128 + VMCS size for VMX.  */
> 	__u32 size;
> 	/* Both structs padded to 120 bytes.  */
> 	union {
> 		/* VMXON, VMCS */
> 		struct kvm_vmx_state vmx;
> 		/* HSAVE_PA, VMCB */
> 		struct kvm_svm_state svm;
> 	}
> 	__u8 data[0];
>
> David, would the above make sense for s390 nested SIE too?
>

s390x doesn't have _any_ SIE state in the hardware. All g2 state is in
g1 memory and therefore migrated. So there is nothing needed at that
point for migration. It just works :)

shadowed SCBs (SIE control blocks) are simply recreated on the target
(after every VSIE execution, we write the data back into g1, so
whenever we leave the interception handler, we act according to the SIE
architecture).

I would have thought that migrating the current VMCS and VMXON pointers
(+ VMX state) should also be enough for VMX, but I haven't yet looked
into the details of the VMCS shadowing/caching that Jim mentioned.

BTW, I assume we can't migrate from Intel to AMD while having a nested
guest. Are there any checks in place for that? (Being new to x86: is it
even possible at all to migrate a guest between AMD and Intel? I assume
so, with appropriate CPU models.)
Paolo Bonzini Dec. 9, 2016, 7:26 p.m. UTC | #3
On 09/12/2016 16:55, David Hildenbrand wrote:
> Am 09.12.2016 um 16:26 schrieb Paolo Bonzini:
>>
>>
>> On 30/11/2016 21:03, Jim Mattson wrote:
>>> +#define KVM_VMX_STATE_GUEST_MODE    0x00000001
>>> +#define KVM_VMX_STATE_RUN_PENDING    0x00000002
>>> +
>>> +/* for KVM_CAP_VMX_STATE */
>>> +struct kvm_vmx_state {
>>> +    __u64 vmxon_ptr;
>>> +    __u64 current_vmptr;
>>> +    __u32 flags;
>>> +    __u32 data_size;
>>> +    __u8 data[0];
>>> +};
>>
>> Let's prepare the API for nested SVM too.  Please rename it to
>> KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:
>>
>>     /* In addition to guest mode and run pending, please
>>      * add one for GIF.
>>      */
>>     __u16 flags;
>>     /* 0 for VMX, 1 for SVM.  */
>>     __u16 format;
>>     /* 128 for SVM, 128 + VMCS size for VMX.  */
>>     __u32 size;
>>     /* Both structs padded to 120 bytes.  */
>>     union {
>>         /* VMXON, VMCS */
>>         struct kvm_vmx_state vmx;
>>         /* HSAVE_PA, VMCB */
>>         struct kvm_svm_state svm;
>>     }
>>     __u8 data[0];
>>
>> David, would the above make sense for s390 nested SIE too?
>>
> 
> s390x doesn't have _any_ SIE state in the hardware. All g2 state is in
> g1 memory and therefore migrated. So there is nothing needed at that
> point for migration. It just works :)
> 
> shadowed SCBs (SIE control blocks) are simply recreated on the target
> (after every VSIE execution, we write the data back into g1, so
> whenever we leave the interception handler, we act according to the SIE
> architecture).

If you get a userspace vmexit during vSIE, do you always exit the vSIE?
If not, the SIE operand is a hidden piece of processor state that you
need in order to restart execution of the vCPU.  This would be more or
less the same as what's needed for x86 AMD.

> BTW, I assume we can't migrate while having a nested guest from Intel
> to AMD. Are there any checks in place for that? (being new to x86, is
> it even possible at all to migrate a guest from AMD <-> Intel? I assume
> so with apropriate CPU models).

Yes, it's possible with appropriate CPU models, but VMX and SVM have
different bits in CPUID.  Therefore, such CPU models would never support
nested virtualization.
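
For reference, the two features are advertised by different CPUID bits
(VMX in CPUID.01H:ECX[5], SVM in CPUID.80000001H:ECX[2]).  A minimal
host-side check, just to make that concrete:

#include <cpuid.h>
#include <stdbool.h>

static bool cpu_has_vmx(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                return false;
        return ecx & (1u << 5);         /* CPUID.01H:ECX.VMX */
}

static bool cpu_has_svm(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
                return false;
        return ecx & (1u << 2);         /* CPUID.80000001H:ECX.SVM */
}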

Paolo
David Hildenbrand Dec. 12, 2016, 2:14 p.m. UTC | #4
>>
>> s390x doesn't have _any_ SIE state in the hardware. All g2 state is in
>> g1 memory and therefore migrated. So there is nothing needed at that
>> point for migration. It just works :)
>>
>> shadowed SCBs (SIE control blocks) are simply recreated on the target
>> (after every VSIE execution, we write the data back into g1, so
>> whenever we leave the interception handler, we act according to the SIE
>> architecture).
>
> If you get a userspace vmexit during vSIE, do you always exit the vSIE?
> If not, the SIE operand is a hidden piece of processor state that you
> need in order to restart execution of the vCPU.  This would be more or
> less the same as what's needed for x86 AMD.

For s390x, there are no instructions emulated for the vSIE, not even in
kernel space. So every time we leave our interception handler, the vSIE
has been fully handled and the next step would be to execute our g1 again.

Please note that (v)SIE and SIE don't share any code paths. vSIE is
really just an interception handler, emulating the SIE instruction (by
calling SIE in return).

So if we drop to user space (e.g. due to a signal), the vSIE has been
fully processed and the g2 SCB block has already been copied back to
g1. When re-entering kernel space, g1 is executed (continuing after the
SIE instruction), which will in return do a handful of checks and
execute g2 again (using SIE - intercept - vSIE).

So there is no hidden piece of processor state. Every time we drop to
user space, everything is properly stored in g1 memory and therefore
migrated. As these user space exits are really rare, we don't have to
optimize for that. And I think if that should ever be needed, this 
interface should be sufficient to extract/set hidden state.
Paolo Bonzini Feb. 15, 2017, 11:56 a.m. UTC | #5
On 09/12/2016 16:26, Paolo Bonzini wrote:
> Let's prepare the API for nested SVM too.  Please rename it to
> KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:
> 
> 	/* In addition to guest mode and run pending, please
> 	 * add one for GIF.
> 	 */
> 	__u16 flags;
> 	/* 0 for VMX, 1 for SVM.  */
> 	__u16 format;
> 	/* 128 for SVM, 128 + VMCS size for VMX.  */
> 	__u32 size;
> 	/* Both structs padded to 120 bytes.  */
> 	union {
> 		/* VMXON, VMCS */
> 		struct kvm_vmx_state vmx;
> 		/* HSAVE_PA, VMCB */
> 		struct kvm_svm_state svm;
> 	}
> 	__u8 data[0];

Thinking again about this, do we really need to store the VMCS in the
ioctl?  What would be the issues if we just flushed it to memory and
reloaded it at KVM_REQ_GET_VMCS12_PAGES time?

Paolo
Jim Mattson Feb. 15, 2017, 4:06 p.m. UTC | #6
The VMCS cache can be safely flushed to guest memory at any time.
However, I think your proposal has some unfortunate consequences:

1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
2. Guest memory (at least the cached VMCS page(s)) has to be saved
after KVM_GET_NESTED_STATE.
3. KVM_GET_NESTED_STATE is not transparent to the guest.


On Wed, Feb 15, 2017 at 3:56 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 09/12/2016 16:26, Paolo Bonzini wrote:
>> Let's prepare the API for nested SVM too.  Please rename it to
>> KVM_CAP/GET/SET_NESTED_STATE and let's organize it like this:
>>
>>       /* In addition to guest mode and run pending, please
>>        * add one for GIF.
>>        */
>>       __u16 flags;
>>       /* 0 for VMX, 1 for SVM.  */
>>       __u16 format;
>>       /* 128 for SVM, 128 + VMCS size for VMX.  */
>>       __u32 size;
>>       /* Both structs padded to 120 bytes.  */
>>       union {
>>               /* VMXON, VMCS */
>>               struct kvm_vmx_state vmx;
>>               /* HSAVE_PA, VMCB */
>>               struct kvm_svm_state svm;
>>       }
>>       __u8 data[0];
>
> Thinking again about this, do we really need to store the VMCS in the
> ioctl?  What would be the issues if we just flushed it to memory and
> reloaded it at KVM_REQ_GET_VMCS12_PAGES time?
>
> Paolo
Paolo Bonzini Feb. 15, 2017, 4:14 p.m. UTC | #7
On 15/02/2017 17:06, Jim Mattson wrote:
> The VMCS cache can be safely flushed to guest memory at any time.
> However, I think your proposal has some unfortunate consequences:
> 
> 1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
> operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
> 2. Guest memory (at least the cached VMCS page(s)) has to be saved
> after KVM_GET_NESTED_STATE.
> 3. KVM_GET_NESTED_STATE is not transparent to the guest.

I can't choose which is the worst of the three. :)

A better one perhaps is to flush the VMCS cache on L2->userspace exit,
since that should be pretty rare (suggested by David).  I think that
would help at least with (2) and (3).

As to (1), after KVM_SET_NESTED_STATE sets the in-guest-mode flag you
don't really need to reload all of the vmcs12 into vmcs02.  Only the
host state needs to be reloaded, while the guest state is set with
KVM_SET_SREGS and others.

Paolo
Jim Mattson Feb. 15, 2017, 4:58 p.m. UTC | #8
Flushing the VMCS cache on L2->userspace exit solves (2), but it means
that L2->userspace exit is not transparent to the guest. Is this
better than (3)?

There is some redundancy in the VMCS cache state currently saved by
GET_NESTED_STATE and the VCPU state saved by GET_SREGS and others, but
it does simplify the vmcs02 initialization. (As an aside, this might
be a lot easier if we eliminated the vmcs02 entirely, as I've
suggested in the past.)

To be honest, I'm not sure what it is that you are trying to optimize
for. Is your concern the extra 4K of VCPU state? I would argue that
the VMCS cache is part of the physical CPU state (though on a physical
CPU, not every field is cached), and that it therefore makes sense to
include the VMCS cache as part of the VCPU state, apart from guest
physical memory.

On Wed, Feb 15, 2017 at 8:14 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 15/02/2017 17:06, Jim Mattson wrote:
>> The VMCS cache can be safely flushed to guest memory at any time.
>> However, I think your proposal has some unfortunate consequences:
>>
>> 1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
>> operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
>> 2. Guest memory (at least the cached VMCS page(s)) has to be saved
>> after KVM_GET_NESTED_STATE.
>> 3. KVM_GET_NESTED_STATE is not transparent to the guest.
>
> I can't choose which is the worst of the three. :)
>
> A better one perhaps is to flush the VMCS cache on L2->userspace exit,
> since that should be pretty rare (suggested by David).  I think that
> would help at least with (2) and (3).
>
> As to (1), after KVM_SET_NESTED_STATE sets the in-guest-mode flag you
> don't really need to reload all of the vmcs12 into vmcs02.  Only the
> host state needs to be reloaded, while the guest state is set with
> KVM_SET_SREGS and others.
>
> Paolo
Paolo Bonzini Feb. 15, 2017, 5:37 p.m. UTC | #9
On 15/02/2017 17:58, Jim Mattson wrote:
> Flushing the VMCS cache on L2->userspace exit solves (2), but it means
> that L2->userspace exit is not transparent to the guest. Is this
> better than (3)?

Yes, I think it is better.  The really nasty part of
"KVM_GET_NESTED_STATE is not transparent to the guest" is that
KVM_GET_NESTED_STATE typically happens while the guest is quiescent and
userspace doesn't expect guest memory to change.  (2) is a special case
of this.

Instead, flushing the VMCS on L2->userspace exit happens during KVM_RUN,
which should not break any invariants that userspace relies on.

> There is some redundancy in the VMCS cache state currently saved by
> GET_NESTED_STATE and the VCPU state saved by GET_SREGS and others, but
> it does simplify the vmcs02 initialization. (As an aside, this might
> be a lot easier if we eliminated the vmcs02 entirely, as I've
> suggested in the past.)

Do you have patches to eliminate the vmcs02 or was it just a proposal?

As things stand now, almost all of the vmcs01 has to be reloaded anyway
from the vmcs12's host state.  But as I said earlier we could do better
if we compared vmcs01 and vmcs12's state (especially the descriptor
caches) and optimized load_vmcs12_host_state accordingly.
Somebody needs to write the code and benchmark it with vmexit.flat...

> To be honest, I'm not sure what it is that you are trying to optimize
> for. Is your concern the extra 4K of VCPU state? I would argue that
> the VMCS cache is part of the physical CPU state (though on a physical
> CPU, not every field is cached), and that it therefore makes sense to
> include the VMCS cache as part of the VCPU state, apart from guest
> physical memory.

It's not really the extra 4K, but rather the ugly "blob" aspect of it
and having to maintain compatibility for it.  Regarding the blobbiness,
see for example KVM_GET/SET_XSAVE, where you can actually marshal and
unmarshal the contents of the XSAVE area into the registers.  Regarding
compatibility, I'm worried that once someone looks at SVM more closely,
we will have to store the VMCB in GET_NESTED_STATE too.
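
To make the XSAVE analogy concrete, a small sketch (the ioctl and struct
are the existing UAPI; the offset arithmetic just follows the
architectural XSAVE layout, with the header at byte 512):

#include <linux/kvm.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

/* Pull one well-defined field (XSTATE_BV) back out of the fixed-size
 * area returned by KVM_GET_XSAVE. */
static int get_xstate_bv(int vcpu_fd, uint64_t *xstate_bv)
{
        struct kvm_xsave xsave;

        if (ioctl(vcpu_fd, KVM_GET_XSAVE, &xsave) < 0)
                return -1;

        memcpy(xstate_bv, (uint8_t *)xsave.region + 512, sizeof(*xstate_bv));
        return 0;
}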

Paolo

> On Wed, Feb 15, 2017 at 8:14 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>
>> On 15/02/2017 17:06, Jim Mattson wrote:
>>> The VMCS cache can be safely flushed to guest memory at any time.
>>> However, I think your proposal has some unfortunate consequences:
>>>
>>> 1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
>>> operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
>>> 2. Guest memory (at least the cached VMCS page(s)) has to be saved
>>> after KVM_GET_NESTED_STATE.
>>> 3. KVM_GET_NESTED_STATE is not transparent to the guest.
>>
>> I can't choose which is the worst of the three. :)
>>
>> A better one perhaps is to flush the VMCS cache on L2->userspace exit,
>> since that should be pretty rare (suggested by David).  I think that
>> would help at least with (2) and (3).
>>
>> As to (1), after KVM_SET_NESTED_STATE sets the in-guest-mode flag you
>> don't really need to reload all of the vmcs12 into vmcs02.  Only the
>> host state needs to be reloaded, while the guest state is set with
>> KVM_SET_SREGS and others.
>>
>> Paolo
Jim Mattson Feb. 15, 2017, 6:19 p.m. UTC | #10
On Wed, Feb 15, 2017 at 9:37 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 15/02/2017 17:58, Jim Mattson wrote:
>> Flushing the VMCS cache on L2->userspace exit solves (2), but it means
>> that L2->userspace exit is not transparent to the guest. Is this
>> better than (3)?
>
> Yes, I think it is better.  The really nasty part of
> "KVM_GET_NESTED_STATE is not transparent to the guest" is that
> KVM_GET_NESTED_STATE typically happens while the guest is quiescent and
> userspace doesn't expect guest memory to change.  (2) is a special case
> of this.
>
> Instead, flushing the VMCS on L2->userspace exit happens during KVM_RUN,
> which should not break any invariants that userspace relies on.

It is certainly within the bounds of the Intel specification to flush
the VMCS cache at any time, but I think it would be better if the VCPU
behaved more deterministically (with respect to guest-visible events).

>> There is some redundancy in the VMCS cache state currently saved by
>> GET_NESTED_STATE and the VCPU state saved by GET_SREGS and others, but
>> it does simplify the vmcs02 initialization. (As an aside, this might
>> be a lot easier if we eliminated the vmcs02 entirely, as I've
>> suggested in the past.)
>
> Do you have patches to eliminate the vmcs02 or was it just a proposal?

Just a proposal. I think a separate vmcs02 has the potential for faster emulated
VM-entry/VM-exit, but at the cost of greater code complexity. A unified vmcs0X
could certainly be as fast as the existing implementation and a lot simpler.

>
> As things stand now, almost all of the vmcs01 has to be reloaded anyway
> from the vmcs12's host state.  But as I said earlier we could be
> different if we compared vmcs01 and vmcs12's state (especially the
> descriptor caches) and optimize load_vmcs12_host_state consequently.
> Somebody needs to write the code and benchmark it with vmexit.flat...
>
>> To be honest, I'm not sure what it is that you are trying to optimize
>> for. Is your concern the extra 4K of VCPU state? I would argue that
>> the VMCS cache is part of the physical CPU state (though on a physical
>> CPU, not every field is cached), and that it therefore makes sense to
>> include the VMCS cache as part of the VCPU state, apart from guest
>> physical memory.
>
> It's not really the extra 4K, but rather the ugly "blob" aspect of it
> and having to maintain compatibility for it.  Regarding the blobbiness,
> see for example KVM_GET/SET_XSAVE, where you can actually marshal and
> unmarshal the contents of the XSAVE area into the registers.  Regarding
> compatibility, I'm worried that once someone looks at SVM more closely,
> we will have to store the VMCB in GET_NESTED_STATE too.

Instead of a "blob," we could pass it out to userspace as a 'struct
vmcs12', padded out to 4k for compatibility purposes. Then userspace
could marshal and unmarshal the contents, and it would be a little
easier than marshalling and unmarshalling the same contents in guest
physical memory.

I don't know what to say about SVM. I think we should be prepared to
store the VMCB at some point, even if we don't now.

BTW, eventually I'm going to want to have two entries in the VMCS cache
to improve the performance of virtualized VMCS shadowing.
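
A sketch of what that payload could look like (purely illustrative; the
names and the fixed 4k size are my assumptions, and 'struct vmcs12' is
today a kernel-internal layout that would have to be nailed down as ABI):

#define VMCS12_IMAGE_SIZE       0x1000

struct kvm_nested_vmx_data {
        union {
                struct vmcs12 vmcs12;           /* current (cached) VMCS12 */
                __u8 pad[VMCS12_IMAGE_SIZE];    /* padded for compatibility */
        };
        /* a second cache entry could be appended later, e.g. for
         * virtualized VMCS shadowing */
};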

>
> Paolo
>
>> On Wed, Feb 15, 2017 at 8:14 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>>>
>>> On 15/02/2017 17:06, Jim Mattson wrote:
>>>> The VMCS cache can be safely flushed to guest memory at any time.
>>>> However, I think your proposal has some unfortunate consequences:
>>>>
>>>> 1. If KVM_SET_NESTED_STATE is asynchronous, then any subsequent set
>>>> operations (e.g. KVM_SET_SREGS) may be overridden on the next KVM_RUN.
>>>> 2. Guest memory (at least the cached VMCS page(s)) has to be saved
>>>> after KVM_GET_NESTED_STATE.
>>>> 3. KVM_GET_NESTED_STATE is not transparent to the guest.
>>>
>>> I can't choose which is the worst of the three. :)
>>>
>>> A better one perhaps is to flush the VMCS cache on L2->userspace exit,
>>> since that should be pretty rare (suggested by David).  I think that
>>> would help at least with (2) and (3).
>>>
>>> As to (1), after KVM_SET_NESTED_STATE sets the in-guest-mode flag you
>>> don't really need to reload all of the vmcs12 into vmcs02.  Only the
>>> host state needs to be reloaded, while the guest state is set with
>>> KVM_SET_SREGS and others.
>>>
>>> Paolo
Paolo Bonzini Feb. 15, 2017, 9:28 p.m. UTC | #11
On 15/02/2017 19:19, Jim Mattson wrote:
> On Wed, Feb 15, 2017 at 9:37 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> Yes, I think it is better.  The really nasty part of
>> "KVM_GET_NESTED_STATE is not transparent to the guest" is that
>> KVM_GET_NESTED_STATE typically happens while the guest is quiescent and
>> userspace doesn't expect guest memory to change.  (2) is a special case
>> of this.
>>
>> Instead, flushing the VMCS on L2->userspace exit happens during KVM_RUN,
>> which should not break any invariants that userspace relies on.
> 
> It is certainly within the bounds of the Intel specification to flush the
> VMCS cache at any time, but I think it would be better if the VCPU behaved more
> deterministically (with respect to guest-visible events).

Yes, it's a trade off.  Of course if you think of SMIs on physical
hardware, determinism is already thrown away.  But I understand that we
can do better than physical processors in this regard. :)

> Just a proposal. I think a separate vmcs02 has the potential for faster emulated
> VM-entry/VM-exit, but at the cost of greater code complexity. A unified vmcs0X
> could certainly be as fast as the existing implementation and a lot simpler.

I agree.  As I said, we need to benchmark it either way.  I honestly
have no idea where time is spent (on a processor with VMCS shadowing) in
a tight loop of L1->L2->L1 entries and exits.

>>> To be honest, I'm not sure what it is that you are trying to optimize
>>> for. Is your concern the extra 4K of VCPU state? I would argue that
>>> the VMCS cache is part of the physical CPU state (though on a physical
>>> CPU, not every field is cached), and that it therefore makes sense to
>>> include the VMCS cache as part of the VCPU state, apart from guest
>>> physical memory.
>>
>> It's not really the extra 4K, but rather the ugly "blob" aspect of it
>> and having to maintain compatibility for it.  Regarding the blobbiness,
>> see for example KVM_GET/SET_XSAVE, where you can actually marshal and
>> unmarshal the contents of the XSAVE area into the registers.  Regarding
>> compatibility, I'm worried that once someone looks at SVM more closely,
>> we will have to store the VMCB in GET_NESTED_STATE too.
> 
> Instead of a "blob," we could pass it out to userspace as a 'struct
> vmcs12,' padded out to 4k for compatibility purposes. Then userspace
> could marshal and unmarshal the contents, and it would be a little
> easier than marshalling and unmarshalling the same contents in guest
> physical memory.

You mean marshalling/unmarshalling it for stuff such as extracting the
EPTP?  Yeah, that's a good reason to keep it in GET_NESTED_STATE.

OTOH, it's more work to do in userspace.  Tradeoffs...

> I don't know what to say about SVM. I think we should be prepared to
> store the VMCB at some point, even if we don't now.

The in-memory VMCB is always consistent with processor state after
VMRUN, but not necessarily during it, so I agree.  Of course the same
solution of flushing it out on L2->userspace exit would work, though I
understand why you think it's a hack.

Paolo
Jim Mattson Dec. 19, 2017, 3:07 a.m. UTC | #12
The other unfortunate thing about flushing the "current" VMCS12 state
to guest memory on each L2->userspace transition is that much of this
state is in the VMCS02. So, it's not just a matter of writing a
VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
updated from the VMCS02 by calling sync_vmcs12(). This will be
particularly bad for double-nesting, where L1 kvm has to take all of
those VMREAD VM-exits.

If you still prefer this method, I will go ahead and do it, but I
remain opposed.
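
For the record, a rough sketch of what that flush would look like, just to
illustrate where the cost comes from (this is not a proposed patch; it only
strings together the existing vmx.c helpers):

static void flush_cached_vmcs12_to_guest(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

        if (!is_guest_mode(vcpu) || vmx->nested.current_vmptr == -1ull)
                return;

        /* vmcs02 -> cached vmcs12: one VMREAD per synced field, each of
         * which becomes a VM-exit to the lower hypervisor when KVM
         * itself runs nested. */
        sync_vmcs12(vcpu, vmcs12);

        /* cached vmcs12 -> guest memory */
        kvm_write_guest(vcpu->kvm, vmx->nested.current_vmptr,
                        vmcs12, sizeof(*vmcs12));
}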

On Wed, Feb 15, 2017 at 1:28 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 15/02/2017 19:19, Jim Mattson wrote:
>> On Wed, Feb 15, 2017 at 9:37 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> Yes, I think it is better.  The really nasty part of
>>> "KVM_GET_NESTED_STATE is not transparent to the guest" is that
>>> KVM_GET_NESTED_STATE typically happens while the guest is quiescent and
>>> userspace doesn't expect guest memory to change.  (2) is a special case
>>> of this.
>>>
>>> Instead, flushing the VMCS on L2->userspace exit happens during KVM_RUN,
>>> which should not break any invariants that userspace relies on.
>>
>> It is certainly within the bounds of the Intel specification to flush the
>> VMCS cache at any time, but I think it would be better if the VCPU behaved more
>> deterministically (with respect to guest-visible events).
>
> Yes, it's a trade off.  Of course if you think of SMIs on physical
> hardware, determinism is already thrown away.  But I understand that we
> can do better than physical processors in this regard. :)
>
>> Just a proposal. I think a separate vmcs02 has the potential for faster emulated
>> VM-entry/VM-exit, but at the cost of greater code complexity. A unified vmcs0X
>> could certainly be as fast as the existing implementation and a lot simpler.
>
> I agree.  As I said, we need to benchmark it either way.  I honestly
> have no idea where time is spent (on a processor with VMCS shadowing) in
> a tight loop of L1->L2->L1 entries and exits.
>
>>>> To be honest, I'm not sure what it is that you are trying to optimize
>>>> for. Is your concern the extra 4K of VCPU state? I would argue that
>>>> the VMCS cache is part of the physical CPU state (though on a physical
>>>> CPU, not every field is cached), and that it therefore makes sense to
>>>> include the VMCS cache as part of the VCPU state, apart from guest
>>>> physical memory.
>>>
>>> It's not really the extra 4K, but rather the ugly "blob" aspect of it
>>> and having to maintain compatibility for it.  Regarding the blobbiness,
>>> see for example KVM_GET/SET_XSAVE, where you can actually marshal and
>>> unmarshal the contents of the XSAVE area into the registers.  Regarding
>>> compatibility, I'm worried that once someone looks at SVM more closely,
>>> we will have to store the VMCB in GET_NESTED_STATE too.
>>
>> Instead of a "blob," we could pass it out to userspace as a 'struct
>> vmcs12,' padded out to 4k for compatibility purposes. Then userspace
>> could marshal and unmarshal the contents, and it would be a little
>> easier than marshalling and unmarshalling the same contents in guest
>> physical memory.
>
> You mean marshalling/unmarshalling it for stuff such as extracting the
> EPTP?  Yeah, that's a good reason to keep it in GET_NESTED_STATE.
>
> OTOH, it's more work to do in userspace.  Tradeoffs...
>
>> I don't know what to say about SVM. I think we should be prepared to
>> store the VMCB at some point, even if we don't now.
>
> The in-memory VMCB is always consistent with processor state after
> VMRUN, but not necessarily during it so I agree.  Of course the same
> solution of flushing it out on L2->userspace exit would work, though I
> understand why you think it's a hack.
>
> Paolo
David Hildenbrand Dec. 19, 2017, 10:35 a.m. UTC | #13
On 19.12.2017 04:07, Jim Mattson wrote:
> The other unfortunate thing about flushing the "current" VMCS12 state
> to guest memory on each L2->userspace transition is that much of this
> state is in the VMCS02. So,it's not just a matter of writing a
> VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
> updated from the VMCS02 by calling sync_vmcs12(). This will be
> particularly bad for double-nesting, where L1 kvm has to take all of
> those VMREAD VM-exits.
> 
> If you still prefer this method, I will go ahead and do it, but I
> remain opposed.

This can be easily solved by letting QEMU trigger a pre-migration ioctl
(FLUSH_NESTED_VMCS). So that shouldn't really be the problem.
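
Something like the following, purely as a sketch (the ioctl name and number
are invented; KVMIO is the existing KVM ioctl type):

#include <linux/kvm.h>
#include <sys/ioctl.h>

#define KVM_FLUSH_NESTED_VMCS   _IO(KVMIO, 0xc0)        /* invented number */

/* Called by the VMM on each vCPU right before it starts saving guest
 * RAM, so the cached VMCS12 is written back and migrated with memory. */
static int flush_nested_state(int vcpu_fd)
{
        return ioctl(vcpu_fd, KVM_FLUSH_NESTED_VMCS, 0);
}
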
Paolo Bonzini Dec. 19, 2017, 12:45 p.m. UTC | #14
On 19/12/2017 04:07, Jim Mattson wrote:
> The other unfortunate thing about flushing the "current" VMCS12 state
> to guest memory on each L2->userspace transition is that much of this
> state is in the VMCS02. So,it's not just a matter of writing a
> VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
> updated from the VMCS02 by calling sync_vmcs12(). This will be
> particularly bad for double-nesting, where L1 kvm has to take all of
> those VMREAD VM-exits.
> 
> If you still prefer this method, I will go ahead and do it, but I
> remain opposed.

I don't (for a different reason---SVM also has off-RAM state).

Paolo
Jim Mattson Dec. 19, 2017, 5:26 p.m. UTC | #15
Yes, it can be done that way, but what makes this approach technically
superior to the original API?

On Tue, Dec 19, 2017 at 2:35 AM, David Hildenbrand <david@redhat.com> wrote:
> On 19.12.2017 04:07, Jim Mattson wrote:
>> The other unfortunate thing about flushing the "current" VMCS12 state
>> to guest memory on each L2->userspace transition is that much of this
>> state is in the VMCS02. So,it's not just a matter of writing a
>> VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
>> updated from the VMCS02 by calling sync_vmcs12(). This will be
>> particularly bad for double-nesting, where L1 kvm has to take all of
>> those VMREAD VM-exits.
>>
>> If you still prefer this method, I will go ahead and do it, but I
>> remain opposed.
>
> This can be easily solved by letting QEMU trigger a pre-migration ioctl.
> (FLUSH_NESTED_VMCS). So that shouldn't really be the problem.
>
> --
>
> Thanks,
>
> David / dhildenb
David Hildenbrand Dec. 19, 2017, 5:33 p.m. UTC | #16
On 19.12.2017 18:26, Jim Mattson wrote:
> Yes, it can be done that way, but what makes this approach technically
> superior to the original API?

a) not having to migrate data twice
b) not having to think about a proper API to get data in/out

All you need to know is whether the guest was in nested mode when
migrating, no? That would be a simple flag.

> 
> On Tue, Dec 19, 2017 at 2:35 AM, David Hildenbrand <david@redhat.com> wrote:
>> On 19.12.2017 04:07, Jim Mattson wrote:
>>> The other unfortunate thing about flushing the "current" VMCS12 state
>>> to guest memory on each L2->userspace transition is that much of this
>>> state is in the VMCS02. So,it's not just a matter of writing a
>>> VMCS12_SIZE blob to guest memory; first, the cached VMCS12 has to be
>>> updated from the VMCS02 by calling sync_vmcs12(). This will be
>>> particularly bad for double-nesting, where L1 kvm has to take all of
>>> those VMREAD VM-exits.
>>>
>>> If you still prefer this method, I will go ahead and do it, but I
>>> remain opposed.
>>
>> This can be easily solved by letting QEMU trigger a pre-migration ioctl.
>> (FLUSH_NESTED_VMCS). So that shouldn't really be the problem.
>>
>> --
>>
>> Thanks,
>>
>> David / dhildenb
David Hildenbrand Dec. 19, 2017, 5:40 p.m. UTC | #17
On 19.12.2017 18:33, David Hildenbrand wrote:
> On 19.12.2017 18:26, Jim Mattson wrote:
>> Yes, it can be done that way, but what makes this approach technically
>> superior to the original API?
> 
> a) not having to migrate data twice
> b) not having to think about a proper API to get data in/out
> 
> All you need to know is, if the guest was in nested mode when migrating,
> no? That would be a simple flag.
> 

(of course, in addition, the vmcs ptr and whether vmxon has been called).

But anyhow, if you have good reasons why you want to introduce and
maintain a new API, feel free to do so. Most likely I am missing
something important here :)
Jim Mattson Dec. 19, 2017, 7:21 p.m. UTC | #18
Certain VMX state cannot be extracted from the kernel today. As you
point out, this includes the vCPU's VMX operating mode {legacy, VMX
root operation, VMX non-root operation}, the current VMCS GPA (if
any), and the VMXON region GPA (if any). Perhaps these could be
appended to the state(s) extracted by one or more existing APIs rather
than introducing a new API, but I think there's sufficient
justification here for a new GET/SET_NESTED_STATE API.

Most L2 guest state can already be extracted by existing APIs, like
GET_SREGS. However, restoring it is a bit problematic today. SET_SREGS
will write into the current VMCS, but we have no existing mechanism
for transferring guest state from vmcs01 to vmcs02. On restore, do we
want to dictate that the vCPU's VMX operating mode has to be restored
before SET_SREGS is called, or do we provide a mechanism for
transferring vmcs01 guest state to vmcs02? If we do dictate that the
vCPU's operating mode has to be restored first, then SET_SREGS will
naturally write into vmcs02, but we'll have to create a mechanism for
building an initial vmcs02 out of nothing.

The only mechanism we have today for building a vmcs02 starts with a
vmcs12. Building on that mechanism, it is fairly straightforward to
write GET/SET_NESTED_STATE. Though there is quite a bit of redundancy
with GET/SET_SREGS, GET/SET_VCPU_EVENTS, etc., if you capture all of
the L2 state in VMCS12 format, you can restore it pretty easily using
the existing infrastructure, without worrying about the ordering of
the SET_* ioctls.
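
For concreteness, the kind of sequence I have in mind, with the nested blob
treated like any other vCPU state component and no guest memory touched at
GET/SET time (the container struct and helpers are placeholders, and the
NESTED_STATE ioctl names follow the proposal in this thread):

struct vcpu_image {
        struct kvm_sregs sregs;
        struct kvm_vcpu_events events;
        struct kvm_nested_state *nested;        /* header + cached VMCS12 */
};

static void save_vcpu(int vcpu_fd, struct vcpu_image *img)
{
        ioctl(vcpu_fd, KVM_GET_SREGS, &img->sregs);
        ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &img->events);
        ioctl(vcpu_fd, KVM_GET_NESTED_STATE, img->nested);
}

static void restore_vcpu(int vcpu_fd, struct vcpu_image *img)
{
        /* With the full VMCS12 in the payload, SET_NESTED_STATE can
         * rebuild the vmcs02 on its own; the remaining SET_* calls do
         * not need any particular ordering. */
        ioctl(vcpu_fd, KVM_SET_NESTED_STATE, img->nested);
        ioctl(vcpu_fd, KVM_SET_SREGS, &img->sregs);
        ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &img->events);
}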

Today, the cached VMCS12 is loaded when the guest executes VMPTRLD,
primarily as a defense against the guest modifying VMCS12 fields in
memory after the hypervisor has checked their validity. There were a
lot of time-of-check to time-of-use security issues before the cached
VMCS12 was introduced. Conveniently, all but the host state of the
cached VMCS12 is dead once the vCPU enters L2, so it seemed like a
reasonable place to stuff the current L2 state for later restoration.
But why pass the cached VMCS12 as a separate vCPU state component
rather than writing it back to guest memory as part of the "save vCPU
state" sequence?

One reason is that it is a bit awkward for GET_NESTED_STATE to modify
guest memory. I don't know about qemu, but our userspace agent expects
guest memory to be quiesced by the time it starts going through its
sequence of GET_* ioctls. Sure, we could introduce a pre-migration
ioctl, but is that the best way to handle this? Another reason is that
it is a bit awkward for SET_NESTED_STATE to require guest memory.
Again, I don't know about qemu, but our userspace agent does not
expect any guest memory to be available when it starts going through
its sequence of SET_* ioctls. Sure, we could prefetch the guest page
containing the current VMCS12, but is that better than simply
including the current VMCS12 in the NESTED_STATE payload? Moreover,
these unpredictable (from the guest's point of view) updates to guest
memory leave a bad taste in my mouth (much like SMM).

Perhaps qemu doesn't have the same limitations that our userspace
agent has, and I can certainly see why you would dismiss my concerns
if you are only interested in qemu as a userspace agent for kvm. At
the same time, I hope you can understand why I am not excited to be
drawn down a path that's going to ultimately mean more headaches for
me in my environment. AFAICT, the proposed API doesn't introduce any
additional headaches for those that use qemu. The principal objections
appear to be the "blob" of data, completely unstructured in the eyes
of the userspace agent, and the redundancy with state already
extracted by existing APIs. Is that right?


On Tue, Dec 19, 2017 at 9:40 AM, David Hildenbrand <david@redhat.com> wrote:
> On 19.12.2017 18:33, David Hildenbrand wrote:
>> On 19.12.2017 18:26, Jim Mattson wrote:
>>> Yes, it can be done that way, but what makes this approach technically
>>> superior to the original API?
>>
>> a) not having to migrate data twice
>> b) not having to think about a proper API to get data in/out
>>
>> All you need to know is, if the guest was in nested mode when migrating,
>> no? That would be a simple flag.
>>
>
> (of course in addition, vmcsptr and if vmxon has been called).
>
> But anyhow, if you have good reasons why you want to introduce and
> maintain a new API, feel free to do so. Most likely I am missing
> something important here :)
>
>
> --
>
> Thanks,
>
> David / dhildenb
David Hildenbrand Dec. 19, 2017, 7:52 p.m. UTC | #19
On 19.12.2017 20:21, Jim Mattson wrote:
> Certain VMX state cannot be extracted from the kernel today. As you
> point out, this includes the vCPU's VMX operating mode {legacy, VMX
> root operation, VMX non-root operation}, the current VMCS GPA (if
> any), and the VMXON region GPA (if any). Perhaps these could be
> appended to the state(s) extracted by one or more existing APIs rather
> than introducing a new API, but I think there's sufficient
> justification here for a new GET/SET_NESTED_STATE API.
> 
> Most L2 guest state can already be extracted by existing APIs, like
> GET_SREGS. However, restoring it is a bit problematic today. SET_SREGS
> will write into the current VMCS, but we have no existing mechanism
> for transferring guest state from vmcs01 to vmcs02. On restore, do we
> want to dictate that the vCPU's VMX operating mode has to be restored
> before SET_SREGS is called, or do we provide a mechanism for
> transferring vmcs01 guest state to vmcs02? If we do dictate that the
> vCPU's operating mode has to be restored first, then SET_SREGS will
> naturally write into vmcs02, but we'll have to create a mechanism for
> building an initial vmcs02 out of nothing.

A small rant before the Christmas holiday :)

The more I look at nested VMX, the more headaches I get. I know somebody
thought it was a good idea to reuse the same execution loop for the guest
and for a nested guest: simply swap some registers (introducing tons of
BUGs, e.g. related to interrupt injection, at least that's my impression)
and there you go.

I think this is a huge mess. As you say, state extraction/injection is a
mess. As user space, you don't really know what you're holding in your
hand. And from that stems the desire to just "dump out a huge blob of
state and feed it back into the migration target". I can understand
that. But I wonder if it has to be this way.

Sometimes nested VMX feels like a huge hack into the existing VMX
module. I once attempted to factor out certain pieces, and finally gave
up. Maybe it is me and not the code :)

> 
> The only mechanism we have today for building a vmcs02 starts with a
> vmcs12. Building on that mechanism, it is fairly straightforward to
> write GET/SET_NESTED_STATE. Though there is quite a bit of redundancy
> with GET/SET_SREGS, GET_SET/VCPU_EVENTS, etc., if you capture all of
> the L2 state in VMCS12 format, you can restore it pretty easily using
> the existing infrastructure, without worrying about the ordering of
> the SET_* ioctls.

And exactly the redundancy you are talking about is what I am afraid of.
Interfaces should be kept simple. If we can hide complexity, do so. But
it all stems from the fact that we turn nested guest state into guest
state, and exit that way to user space.

> 
> Today, the cached VMCS12 is loaded when the guest executes VMPTRLD,
> primarily as a defense against the guest modifying VMCS12 fields in
> memory after the hypervisor has checked their validity. There were a
> lot of time-of-check to time-of-use security issues before the cached
> VMCS12 was introduced. Conveniently, all but the host state of the
> cached VMCS12 is dead once the vCPU enters L2, so it seemed like a
> reasonable place to stuff the current L2 state for later restoration.
> But why pass the cached VMCS12 as a separate vCPU state component
> rather than writing it back to guest memory as part of the "save vCPU
> state" sequence?
> 
> One reason is that it is a bit awkward for GET_NESTED_STATE to modify
> guest memory. I don't know about qemu, but our userspace agent expects
> guest memory to be quiesced by the time it starts going through its
> sequence of GET_* ioctls. Sure, we could introduce a pre-migration
> ioctl, but is that the best way to handle this? Another reason is that
> it is a bit awkward for SET_NESTED_STATE to require guest memory.
> Again, I don't know about qemu, but our userspace agent does not
> expect any guest memory to be available when it starts going through
> its sequence of SET_* ioctls. Sure, we could prefetch the guest page
> containing the current VMCS12, but is that better than simply
> including the current VMCS12 in the NESTED_STATE payload? Moreover,
> these unpredictable (from the guest's point of view) updates to guest
> memory leave a bad taste in my mouth (much like SMM).

In my perfect world, we would never exit to user space with nested guest
registers. When migrating, all state is implicitly contained in guest
memory.

I might have a bit of a biased opinion here. On s390x we were able to
make nested virtualization work completely transparently to user space.

1. every time we exit to user space, guest registers are guest registers
2. we have a separate nested execution loop for nested virtualization.
3. nested guest state is completely contained in guest memory whenever
we stop executing the nested guest (to return to the guest or to user
space)

We are able to do that by not remembering if we were in nested mode or
not. When exiting to QEMU and reentering, we simply return to L1. L1
will figure out that there is nothing to do (a "0" intercept) and simply
rerun its guest, resulting in us running L2. But even if we wanted to
continue running L1, it would be as easy as rewinding the PC to point
again at the SIE instruction before exiting to user space. Or introduce
a separate flag. (at least I think it would be that easy)

Not saying s390x implementation is perfect, but it seems to be way
easier to handle compared to what we have with VMX right now.

> 
> Perhaps qemu doesn't have the same limitations that our userspace
> agent has, and I can certainly see why you would dismiss my concerns
> if you are only interested in qemu as a userspace agent for kvm. At
> the same time, I hope you can understand why I am not excited to be
> drawn down a path that's going to ultimately mean more headaches for
> me in my environment. AFAICT, the proposed API doesn't introduce any
> additional headaches for those that use qemu. The principal objections
> appear to be the "blob" of data, completely unstructured in the eyes
> of the userspace agent, and the redundancy with state already
> extracted by existing APIs. Is that right?

I am wondering if we should rather try to redesign nVMX to work somewhat
similarly to the s390x way of running nested guests before we introduce a
new API that makes the current design "fixed".

That said, I assume this is highly unlikely (and many people will
most likely object to what I have written in this mail). So please feel
free to ignore what I have said in this email and go forward with your
approach. Having migration is better than not having migration.

Yes, I am a dreamer. :D
Paolo Bonzini Dec. 19, 2017, 9:29 p.m. UTC | #20
On 19/12/2017 20:21, Jim Mattson wrote:
> One reason is that it is a bit awkward for GET_NESTED_STATE to modify
> guest memory. I don't know about qemu, but our userspace agent expects
> guest memory to be quiesced by the time it starts going through its
> sequence of GET_* ioctls. Sure, we could introduce a pre-migration
> ioctl, but is that the best way to handle this? Another reason is that
> it is a bit awkward for SET_NESTED_STATE to require guest memory.
> Again, I don't know about qemu, but our userspace agent does not
> expect any guest memory to be available when it starts going through
> its sequence of SET_* ioctls. Sure, we could prefetch the guest page
> containing the current VMCS12, but is that better than simply
> including the current VMCS12 in the NESTED_STATE payload? Moreover,
> these unpredictable (from the guest's point of view) updates to guest
> memory leave a bad taste in my mouth (much like SMM).

IIRC QEMU has no problem with either, but I think your concerns are
valid.  The active VMCS is processor state, not memory state.  Same for
the host save data in SVM.

The unstructured "blob" of data is not an issue.  If it becomes a
problem, we can always document the structure...

Paolo

> 
> Perhaps qemu doesn't have the same limitations that our userspace
> agent has, and I can certainly see why you would dismiss my concerns
> if you are only interested in qemu as a userspace agent for kvm. At
> the same time, I hope you can understand why I am not excited to be
> drawn down a path that's going to ultimately mean more headaches for
> me in my environment. AFAICT, the proposed API doesn't introduce any
> additional headaches for those that use qemu. The principal objections
> appear to be the "blob" of data, completely unstructured in the eyes
> of the userspace agent, and the redundancy with state already
> extracted by existing APIs. Is that right?
> 
> 
> On Tue, Dec 19, 2017 at 9:40 AM, David Hildenbrand <david@redhat.com> wrote:
>> On 19.12.2017 18:33, David Hildenbrand wrote:
>>> On 19.12.2017 18:26, Jim Mattson wrote:
>>>> Yes, it can be done that way, but what makes this approach technically
>>>> superior to the original API?
>>>
>>> a) not having to migrate data twice
>>> b) not having to think about a proper API to get data in/out
>>>
>>> All you need to know is, if the guest was in nested mode when migrating,
>>> no? That would be a simple flag.
>>>
>>
>> (of course in addition, vmcsptr and if vmxon has been called).
>>
>> But anyhow, if you have good reasons why you want to introduce and
>> maintain a new API, feel free to do so. Most likely I am missing
>> something important here :)
>>
>>
>> --
>>
>> Thanks,
>>
>> David / dhildenb
David Hildenbrand Jan. 8, 2018, 10:35 a.m. UTC | #21
On 19.12.2017 22:29, Paolo Bonzini wrote:
> On 19/12/2017 20:21, Jim Mattson wrote:
>> One reason is that it is a bit awkward for GET_NESTED_STATE to modify
>> guest memory. I don't know about qemu, but our userspace agent expects
>> guest memory to be quiesced by the time it starts going through its
>> sequence of GET_* ioctls. Sure, we could introduce a pre-migration
>> ioctl, but is that the best way to handle this? Another reason is that
>> it is a bit awkward for SET_NESTED_STATE to require guest memory.
>> Again, I don't know about qemu, but our userspace agent does not
>> expect any guest memory to be available when it starts going through
>> its sequence of SET_* ioctls. Sure, we could prefetch the guest page
>> containing the current VMCS12, but is that better than simply
>> including the current VMCS12 in the NESTED_STATE payload? Moreover,
>> these unpredictable (from the guest's point of view) updates to guest
>> memory leave a bad taste in my mouth (much like SMM).
> 
> IIRC QEMU has no problem with either, but I think your concerns are
> valid.  The active VMCS is processor state, not memory state.  Same for
> the host save data in SVM.
> 
> The unstructured "blob" of data is not an issue.  If it becomes a
> problem, we can always document the structure...

Thinking about it, I agree. It might be simpler/cleaner to transfer the
"loaded" VMCS. But I think we should take care of only transferring data
that actually is CPU state and not special to our current
implementation. (e.g. nested_run_pending, I would say, is special to our
current implementation, but we can discuss)

So what I would consider VMX state:
- vmxon
- vmxon_ptr
- vmptr
- cached_vmcs12
- ... ?
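
Put together as a struct, that would be something like (a minimal sketch;
the names and the fixed-size VMCS12 image are just illustration):

struct kvm_vmx_migration_state {
        __u8  vmxon;                    /* VMXON executed? */
        __u8  pad[7];
        __u64 vmxon_ptr;                /* VMXON region GPA */
        __u64 vmptr;                    /* current VMCS GPA, or -1 */
        __u8  cached_vmcs12[4096];      /* current (cached) VMCS12 image */
};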

> 
> Paolo
Jim Mattson Jan. 8, 2018, 5:25 p.m. UTC | #22
How do we eliminate nested_run_pending? Do we enforce the invariant
that nested_run_pending is never set on return to userspace, or do we
return an error if GET_NESTED_STATE is called when nested_run_pending
is set?

On Mon, Jan 8, 2018 at 2:35 AM, David Hildenbrand <david@redhat.com> wrote:
> On 19.12.2017 22:29, Paolo Bonzini wrote:
>> On 19/12/2017 20:21, Jim Mattson wrote:
>>> One reason is that it is a bit awkward for GET_NESTED_STATE to modify
>>> guest memory. I don't know about qemu, but our userspace agent expects
>>> guest memory to be quiesced by the time it starts going through its
>>> sequence of GET_* ioctls. Sure, we could introduce a pre-migration
>>> ioctl, but is that the best way to handle this? Another reason is that
>>> it is a bit awkward for SET_NESTED_STATE to require guest memory.
>>> Again, I don't know about qemu, but our userspace agent does not
>>> expect any guest memory to be available when it starts going through
>>> its sequence of SET_* ioctls. Sure, we could prefetch the guest page
>>> containing the current VMCS12, but is that better than simply
>>> including the current VMCS12 in the NESTED_STATE payload? Moreover,
>>> these unpredictable (from the guest's point of view) updates to guest
>>> memory leave a bad taste in my mouth (much like SMM).
>>
>> IIRC QEMU has no problem with either, but I think your concerns are
>> valid.  The active VMCS is processor state, not memory state.  Same for
>> the host save data in SVM.
>>
>> The unstructured "blob" of data is not an issue.  If it becomes a
>> problem, we can always document the structure...
>
> Thinking about it, I agree. It might be simpler/cleaner to transfer the
> "loaded" VMCS. But I think we should take care of only transferring data
> that actually is CPU state and not special to our current
> implementation. (e.g. nested_run_pending I would says is special to out
> current implementation, but we can discuss)
>
> So what I would consider VMX state:
> - vmxon
> - vmxon_ptr
> - vmptr
> - cached_vmcs12
> - ... ?
>
>>
>> Paolo
>
>
> --
>
> Thanks,
>
> David / dhildenb
Paolo Bonzini Jan. 8, 2018, 5:36 p.m. UTC | #23
On 08/01/2018 11:35, David Hildenbrand wrote:
> Thinking about it, I agree. It might be simpler/cleaner to transfer the
> "loaded" VMCS. But I think we should take care of only transferring data
> that actually is CPU state and not special to our current
> implementation. (e.g. nested_run_pending I would says is special to out
> current implementation, but we can discuss)
> 
> So what I would consider VMX state:
> - vmxon
> - vmxon_ptr
> - vmptr
> - cached_vmcs12
> - ... ?

nested_run_pending is in the same boat as the various
KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
"architectural" state, but it's part of the state machine so it has to
be serialized.
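
For comparison, that existing case in the UAPI, which userspace already
has to serialize even though it is not architectural register state:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int nmi_in_flight(int vcpu_fd)
{
        struct kvm_vcpu_events events;

        if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
                return -1;

        /* "injected" vs. "pending" is pure state-machine bookkeeping,
         * yet it has to survive migration, just like nested_run_pending. */
        return events.nmi.injected || events.nmi.pending;
}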

Thanks,

Paolo
David Hildenbrand Jan. 8, 2018, 5:59 p.m. UTC | #24
On 08.01.2018 18:36, Paolo Bonzini wrote:
> On 08/01/2018 11:35, David Hildenbrand wrote:
>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>> "loaded" VMCS. But I think we should take care of only transferring data
>> that actually is CPU state and not special to our current
>> implementation. (e.g. nested_run_pending I would says is special to out
>> current implementation, but we can discuss)
>>
>> So what I would consider VMX state:
>> - vmxon
>> - vmxon_ptr
>> - vmptr
>> - cached_vmcs12
>> - ... ?
> 
> nested_run_pending is in the same boat as the various
> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
> "architectural" state, but it's part of the state machine so it has to
> be serialized.

I am wondering if we can get rid of it; in fact, whether we can go out of
VMX mode every time we go to user space. As soon as we put it into the
official VMX migration protocol, we have to support it. Now seems like
the last time we can change it.

I have the following things in mind (unfortunately I don't have time to
look into the details) to make nested_run_pending completely internal state.

1. When going into user space, if we have nested_run_pending=true, set
it to false and rewind the PSW (RIP) to point again at the VMLAUNCH/VMRESUME
instruction. The next VCPU run will simply continue executing the nested
guest (trying to execute the VMLAUNCH/VMRESUME again).

2. When going into user space, if we have nested_run_pending=true, set
it to false and fake another VMX exit that has no side effects (if
possible). Something like the NULL intercept we have on s390x.

But I have no idea how that plays e.g. with KVM_GET_VCPU_EVENTS (again,
unfortunately no time to look into all the details). If we could get 1.
running it would be cool, but I am not sure if it is feasible.

If not, then we most likely will have to stick to it :( And as Paolo
said, migrate it.

Thanks

> 
> Thanks,
> 
> Paolo
>
Paolo Bonzini Jan. 8, 2018, 6:11 p.m. UTC | #25
On 08/01/2018 18:59, David Hildenbrand wrote:
> On 08.01.2018 18:36, Paolo Bonzini wrote:
>> On 08/01/2018 11:35, David Hildenbrand wrote:
>>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>>> "loaded" VMCS. But I think we should take care of only transferring data
>>> that actually is CPU state and not special to our current
>>> implementation. (e.g. nested_run_pending I would says is special to out
>>> current implementation, but we can discuss)
>>>
>>> So what I would consider VMX state:
>>> - vmxon
>>> - vmxon_ptr
>>> - vmptr
>>> - cached_vmcs12
>>> - ... ?
>>
>> nested_run_pending is in the same boat as the various
>> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
>> "architectural" state, but it's part of the state machine so it has to
>> be serialized.
> 
> I am wondering if we can get rid of it. In fact if we can go out of VMX
> mode every time we go to user space.

There are cases where this is not possible, for example if you have a
nested "assigned device" that is emulated by userspace.

Paolo

> As soon as we put it into the
> official VMX migration protocol, we have to support it. Now seems like
> the last time we can change it.
> 
> I have the following things in mind (unfortunately I don't have time to
> look into the details) to make nested_run_pending completely internal state.
> 
> 1. When going into user space, if we have nested_run_pending=true, set
> it to false and rewind the PSW to point again at the VMLAUNCH/VMRESUME
> function. The next VCPU run will simply continue executing the nested
> guest (trying to execute the VMLAUNCH/VMRESUME again).
> 
> 2. When going into user space, if we have nested_run_pending=true, set
> it to false and fake another VMX exit that has no side effects (if
> possible). Something like the NULL intercept we have on s390x.
> 
> But I have no idea how that plays e.g. with KVM_GET_VCPU_EVENTS (again,
> unfortunately no time to look into all the details). If we could get 1.
> running it would be cool, but I am not sure if it is feasible.
> 
> If not, than we most likely will have to stick to it :( And as Paolo
> said, migrate it.
> 
> Thanks
> 
>>
>> Thanks,
>>
>> Paolo
>>
> 
>
Jim Mattson Jan. 8, 2018, 6:17 p.m. UTC | #26
On Mon, Jan 8, 2018 at 9:59 AM, David Hildenbrand <david@redhat.com> wrote:
> On 08.01.2018 18:36, Paolo Bonzini wrote:
>> On 08/01/2018 11:35, David Hildenbrand wrote:
>>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>>> "loaded" VMCS. But I think we should take care of only transferring data
>>> that actually is CPU state and not special to our current
>>> implementation. (e.g. nested_run_pending I would says is special to out
>>> current implementation, but we can discuss)
>>>
>>> So what I would consider VMX state:
>>> - vmxon
>>> - vmxon_ptr
>>> - vmptr
>>> - cached_vmcs12
>>> - ... ?
>>
>> nested_run_pending is in the same boat as the various
>> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
>> "architectural" state, but it's part of the state machine so it has to
>> be serialized.
>
> I am wondering if we can get rid of it. In fact if we can go out of VMX
> mode every time we go to user space. As soon as we put it into the
> official VMX migration protocol, we have to support it. Now seems like
> the last time we can change it.
>
> I have the following things in mind (unfortunately I don't have time to
> look into the details) to make nested_run_pending completely internal state.
>
> 1. When going into user space, if we have nested_run_pending=true, set
> it to false and rewind the PSW to point again at the VMLAUNCH/VMRESUME
> function. The next VCPU run will simply continue executing the nested
> guest (trying to execute the VMLAUNCH/VMRESUME again).

This would require some effort, since we've already committed some
state changes that we aren't currently prepared to roll back (e.g. the
VM-entry MSR load list, DR7, etc.).

We need to address these problems anyway, if we want to continue to
defer control field checks to the hardware (as we do now) and not be
completely broken (as we are now).
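
Concretely, the VM-entry MSR-load area is architecturally a list of
(MSR index, value) pairs that VM-entry writes into guest MSRs, so the
pre-entry values are already gone by the time we would want to roll
back. A purely conceptual sketch (not KVM code; struct layout per the
SDM, wrmsr_on_vcpu() is a hypothetical helper):

#include <stdint.h>

/* Layout of one VM-entry MSR-load area entry (16 bytes, per the SDM). */
struct msr_load_entry {
	uint32_t index;		/* MSR index to load */
	uint32_t reserved;
	uint64_t data;		/* value written into the MSR */
};

/* Hypothetical helper: write one MSR in the vCPU's guest context. */
void wrmsr_on_vcpu(uint32_t index, uint64_t data);

/* What a committed VM-entry has conceptually already done. */
static void load_entry_msrs(const struct msr_load_entry *e, uint32_t count)
{
	for (uint32_t i = 0; i < count; i++)
		wrmsr_on_vcpu(e[i].index, e[i].data);	/* old values are lost */
}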

> 2. When going into user space, if we have nested_run_pending=true, set
> it to false and fake another VMX exit that has no side effects (if
> possible). Something like the NULL intercept we have on s390x.

I don't think this is feasible, basically because there is no such VM-exit.

> But I have no idea how that plays e.g. with KVM_GET_VCPU_EVENTS (again,
> unfortunately no time to look into all the details). If we could get 1.
> running it would be cool, but I am not sure if it is feasible.
>
> If not, than we most likely will have to stick to it :( And as Paolo
> said, migrate it.
>
> Thanks
>
>>
>> Thanks,
>>
>> Paolo
>>
>
>
> --
>
> Thanks,
>
> David / dhildenb
David Hildenbrand Jan. 8, 2018, 6:23 p.m. UTC | #27
On 08.01.2018 19:11, Paolo Bonzini wrote:
> On 08/01/2018 18:59, David Hildenbrand wrote:
>> On 08.01.2018 18:36, Paolo Bonzini wrote:
>>> On 08/01/2018 11:35, David Hildenbrand wrote:
>>>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>>>> "loaded" VMCS. But I think we should take care of only transferring data
>>>> that actually is CPU state and not special to our current
>>>> implementation. (e.g. nested_run_pending I would says is special to out
>>>> current implementation, but we can discuss)
>>>>
>>>> So what I would consider VMX state:
>>>> - vmxon
>>>> - vmxon_ptr
>>>> - vmptr
>>>> - cached_vmcs12
>>>> - ... ?
>>>
>>> nested_run_pending is in the same boat as the various
>>> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
>>> "architectural" state, but it's part of the state machine so it has to
>>> be serialized.
>>
>> I am wondering if we can get rid of it. In fact if we can go out of VMX
>> mode every time we go to user space.
> 
> There are cases where this is not possible, for example if you have a
> nested "assigned device" that is emulated by userspace.

You mean L0 emulates a device for L1 (in userspace), and L1 assigns this
device as an "assigned device" to L2. Now if L2 tries to access the device,
we have to go to userspace in L0 to handle the access, and therefore have
to stay in nested mode so we can properly process the request when
re-entering KVM from userspace?

Didn't know that was even possible.

> 
> Paolo
Jim Mattson Jan. 8, 2018, 8:19 p.m. UTC | #28
Even more trivially, what if the L2 VM is configured never to leave
VMX non-root operation? Then we never exit to userspace?

On Mon, Jan 8, 2018 at 10:23 AM, David Hildenbrand <david@redhat.com> wrote:
> On 08.01.2018 19:11, Paolo Bonzini wrote:
>> On 08/01/2018 18:59, David Hildenbrand wrote:
>>> On 08.01.2018 18:36, Paolo Bonzini wrote:
>>>> On 08/01/2018 11:35, David Hildenbrand wrote:
>>>>> Thinking about it, I agree. It might be simpler/cleaner to transfer the
>>>>> "loaded" VMCS. But I think we should take care of only transferring data
>>>>> that actually is CPU state and not special to our current
>>>>> implementation. (e.g. nested_run_pending I would says is special to out
>>>>> current implementation, but we can discuss)
>>>>>
>>>>> So what I would consider VMX state:
>>>>> - vmxon
>>>>> - vmxon_ptr
>>>>> - vmptr
>>>>> - cached_vmcs12
>>>>> - ... ?
>>>>
>>>> nested_run_pending is in the same boat as the various
>>>> KVM_GET_VCPU_EVENTS flags (e.g. nmi.injected vs. nmi.pending).  It's not
>>>> "architectural" state, but it's part of the state machine so it has to
>>>> be serialized.
>>>
>>> I am wondering if we can get rid of it. In fact if we can go out of VMX
>>> mode every time we go to user space.
>>
>> There are cases where this is not possible, for example if you have a
>> nested "assigned device" that is emulated by userspace.
>
> You mean L0 emulates a device for L1 (in userspace). L1 assigns this
> device as "assigned device" to L2. Now if L2 tries to access the device,
> we have to go in L0 into userspace to handle the access, therefore
> requiring us to stay in nested mode so we can properly process the
> request when re-entering KVM from userspace?
>
> Didn't know that was even possible.
>
>>
>> Paolo
>
>
> --
>
> Thanks,
>
> David / dhildenb
David Hildenbrand Jan. 8, 2018, 8:27 p.m. UTC | #29
On 08.01.2018 21:19, Jim Mattson wrote:
> Even more trivially, what if the L2 VM is configured never to leave
> VMX non-root operation? Then we never exit to userspace?

Well, we would make the PC point at the VMLAUNCH again and then exit to
userspace.

When returning from userspace, we would try to execute the VMLAUNCH again,
leading to an intercept and taking us back into nested mode.

One open question is what happens to interrupts. They could be delivered,
but it would look as if the guest had not been executed (since we are
pointing at the VMLAUNCH instruction). I am not sure whether this is a
purely theoretical problem.

But the "assigned devices" case is a real difference from s390x. Assigning
devices requires special SIE extensions that we don't implement for the
vSIE, and there is no such thing as MMIO.

And if there is one case where we need it (assigned devices), even though
we might find a way around it, it is very likely that there are more.
Jim Mattson Jan. 8, 2018, 8:59 p.m. UTC | #30
On Mon, Jan 8, 2018 at 12:27 PM, David Hildenbrand <david@redhat.com> wrote:
> On 08.01.2018 21:19, Jim Mattson wrote:
>> Even more trivially, what if the L2 VM is configured never to leave
>> VMX non-root operation? Then we never exit to userspace?
>
> Well we would make the PC point at the VMLAUNCH. Then exit to userspace.

That doesn't work, for so many reasons.

1. It's not sufficient to just roll back the instruction pointer. You
also need to roll back CS, CR0, CR3 (and possibly the PDPTEs), and CR4,
so that the virtual address of the instruction pointer will actually
map to the same physical address as it did the first time. If the page
tables have changed, or the L1 guest has overwritten the
VMLAUNCH/VMRESUME instruction, then you're out of luck.
2. As you point out, interrupts are a problem. Interrupts can't be
delivered in this context, because the vCPU shouldn't be in this
context (and the guest may have already observed the transition to
L2).
3. I'm assuming that you're planning to store most of the current L2
state in the cached VMCS12, at least where you can. Even so, the next
"VM-entry" can't perform any of the normal VM-entry actions that would
clobber the current L2 state that isn't in the cached VMCS12 (e.g.
processing the VM-entry MSR load list). So, you need to have a flag
indicating that this isn't a real VM-entry. That's no better than
carrying the nested_run_pending flag.

> When returning from userspace, try to execute VMLAUNCH again, leading to
> an intercept and us going back into nested mode.
>
> A question would be, what happens to interrupts. They could be
> delivered, but it would look like the guest was not executed. (as we are
> pointing at the VMLAUNCH instruction). Not sure of this is a purely
> theoretical problem.
>
> But the "assigned devices" thing is a real difference to s390x.
> Assigning devices requires special SIE extensions we don't implement for
> vSIE. And there is no such thing as MMIO.
>
> And if there is one case where we need it (assigned devices) - even
> though we might find a way around it - it is very likely that there is more.
>
> --
>
> Thanks,
>
> David / dhildenb
David Hildenbrand Jan. 8, 2018, 9:19 p.m. UTC | #31
On 08.01.2018 21:59, Jim Mattson wrote:
> On Mon, Jan 8, 2018 at 12:27 PM, David Hildenbrand <david@redhat.com> wrote:
>> On 08.01.2018 21:19, Jim Mattson wrote:
>>> Even more trivially, what if the L2 VM is configured never to leave
>>> VMX non-root operation? Then we never exit to userspace?
>>
>> Well we would make the PC point at the VMLAUNCH. Then exit to userspace.
> 
> That doesn't work, for so many reasons.
> 
> 1. It's not sufficient to just rollback the instruction pointer. You
> also need to rollback CS, CR0, CR3 (and possibly the PDPTEs), and CR4,
> so that the virtual address of the instruction pointer will actually
> map to the same physical address as it did the first time.

I expect these values to be the same when leaving non-root mode (as the
CPU itself hasn't executed anything except the nested guest). But yes, it
could be tricky.

> If the page
> tables have changed, or the L1 guest has overwritten the
> VMLAUNCH/VMRESUME instruction, then you're out of luck.

Page tables getting changed by other CPUs is actually a good point. But
I would consider both of these "theoretical" problems, at least compared
to the interrupt issue, which could also hit guests that behave in a more
sane way.

> 2. As you point out, interrupts are a problem. Interrupts can't be
> delivered in this context, because the vCPU shouldn't be in this
> context (and the guest may have already observed the transition to
> L2).

Yes, I also see this as the major problem.

> 3. I'm assuming that you're planning to store most of the current L2
> state in the cached VMCS12, at least where you can. Even so, the next
> "VM-entry" can't perform any of the normal VM-entry actions that would
> clobber the current L2 state that isn't in the cached VMCS12 (e.g.
> processing the VM-entry MSR load list). So, you need to have a flag
> indicating that this isn't a real VM-entry. That's no better than
> carrying the nested_run_pending flag.

I am not sure that would really be necessary (I would have to look into
the details first). But it sounds like nested_run_pending is unavoidable
on x86. So I'd better get used to QEMU dealing with nested CPU state
(which is somewhat scary to me: an emulator getting involved in nested
execution, what could go wrong :) )

Good that we talked about it (and thanks for your time). I learned a lot
today!

>>
>> --
>>
>> Thanks,
>>
>> David / dhildenb
Jim Mattson Jan. 8, 2018, 9:40 p.m. UTC | #32
On Mon, Jan 8, 2018 at 1:19 PM, David Hildenbrand <david@redhat.com> wrote:
> On 08.01.2018 21:59, Jim Mattson wrote:
>> On Mon, Jan 8, 2018 at 12:27 PM, David Hildenbrand <david@redhat.com> wrote:
>>> On 08.01.2018 21:19, Jim Mattson wrote:
>>>> Even more trivially, what if the L2 VM is configured never to leave
>>>> VMX non-root operation? Then we never exit to userspace?
>>>
>>> Well we would make the PC point at the VMLAUNCH. Then exit to userspace.
>>
>> That doesn't work, for so many reasons.
>>
>> 1. It's not sufficient to just rollback the instruction pointer. You
>> also need to rollback CS, CR0, CR3 (and possibly the PDPTEs), and CR4,
>> so that the virtual address of the instruction pointer will actually
>> map to the same physical address as it did the first time.
>
> I expect these values to be the same once leaving non-root mode (as the
> CPU itself hasn't executed anything except the nested guest) But yes, it
> could be tricky.

The vmcs01 does serve as a cache of L1 state at the time of VM-entry,
so if we simply restored the vmcs01 state, that would take care of
most of the rollback issues, as long as we don't deliver any
interrupts in this context. However, I would like to see the
vmcs01/vmcs02 separation go away at some point. svm.c seems to do fine
with just one VMCB.

> If the page
>> tables have changed, or the L1 guest has overwritten the
>> VMLAUNCH/VMRESUME instruction, then you're out of luck.
>
> Page tables getting changed by other CPUs is actually a good point. But
> I would consider both as "theoretical" problems. At least compared to
> the interrupt stuff, which could also happen on guests behaving in a
> more sane way.

My preference is for solutions that are architecturally correct,
thereby solving the theoretical problems as well as the empirical
ones. However, I grant that the Linux community leans the other way in
general.

>> 2. As you point out, interrupts are a problem. Interrupts can't be
>> delivered in this context, because the vCPU shouldn't be in this
>> context (and the guest may have already observed the transition to
>> L2).
>
> Yes, I also see this as the major problem.
>
>> 3. I'm assuming that you're planning to store most of the current L2
>> state in the cached VMCS12, at least where you can. Even so, the next
>> "VM-entry" can't perform any of the normal VM-entry actions that would
>> clobber the current L2 state that isn't in the cached VMCS12 (e.g.
>> processing the VM-entry MSR load list). So, you need to have a flag
>> indicating that this isn't a real VM-entry. That's no better than
>> carrying the nested_run_pending flag.
>
> Not sure if that would really be necessary (would have to look into the
> details first). But sounds like nested_run_pending seems unavoidable on
> x86. So I'd better get used to QEMU dealing with nested CPU state (which
> is somehow scary to me - an emulator getting involved in nested
> execution - what could go wrong :) )

For save/restore, QEMU doesn't actually have to know what the flag
means. It just has to pass it on. (Our userspace agent doesn't know
anything about the meaning of the flags in the kvm_vmx_state
structure.)
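
To make the pass-through concrete, here is a minimal userspace sketch,
assuming the KVM_GET_VMX_STATE/KVM_SET_VMX_STATE ioctls and the
struct kvm_vmx_state layout proposed in this series; the helper names
and the use of the documented 8192-byte maximum are illustrative only,
not part of the patch.

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define VMX_STATE_MAX_DATA 8192	/* documented maximum data size */

/* Source side: fetch the opaque VMX state blob for one vCPU. */
static struct kvm_vmx_state *save_vmx_state(int vcpu_fd)
{
	struct kvm_vmx_state *state;

	state = calloc(1, sizeof(*state) + VMX_STATE_MAX_DATA);
	if (!state)
		return NULL;

	/* Tell the kernel how much room there is for the vmcs12 data. */
	state->data_size = VMX_STATE_MAX_DATA;
	if (ioctl(vcpu_fd, KVM_GET_VMX_STATE, state) < 0) {
		free(state);
		return NULL;
	}
	/*
	 * state->flags (guest mode, run pending) and state->data are
	 * opaque here; they are simply forwarded to the destination.
	 */
	return state;
}

/* Destination side: hand the blob back to the kernel unmodified. */
static int restore_vmx_state(int vcpu_fd, struct kvm_vmx_state *state)
{
	return ioctl(vcpu_fd, KVM_SET_VMX_STATE, state) < 0 ? -errno : 0;
}

Whether the destination vCPU comes back with nested_run_pending set is
then decided entirely by the flag carried in state->flags.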

> Good we talked about it (and thanks for your time). I learned a lot today!
>
>>>
>>> --
>>>
>>> Thanks,
>>>
>>> David / dhildenb
>
>
> --
>
> Thanks,
>
> David / dhildenb
David Hildenbrand Jan. 8, 2018, 9:46 p.m. UTC | #33
> The vmcs01 does serve as a cache of L1 state at the time of VM-entry,
> so if we simply restored the vmcs01 state, that would take care of
> most of the rollback issues, as long as we don't deliver any
> interrupts in this context. However, I would like to see the
> vmcs01/vmcs02 separation go away at some point. svm.c seems to do fine
> with just once VMCB.

Interesting point, might make things easier for VMX.

> 
>> If the page
>>> tables have changed, or the L1 guest has overwritten the
>>> VMLAUNCH/VMRESUME instruction, then you're out of luck.
>>
>> Page tables getting changed by other CPUs is actually a good point. But
>> I would consider both as "theoretical" problems. At least compared to
>> the interrupt stuff, which could also happen on guests behaving in a
>> more sane way.
> 
> My preference is for solutions that are architecturally correct,
> thereby solving the theoretical problems as well as the empirical
> ones. However, I grant that the Linux community leans the other way in
> general.

I usually agree, unless it makes the code horribly complicated without
any real benefit (e.g. for corner cases like this one: a CPU modifying
instruction text that another CPU is currently executing).

>>> 3. I'm assuming that you're planning to store most of the current L2
>>> state in the cached VMCS12, at least where you can. Even so, the next
>>> "VM-entry" can't perform any of the normal VM-entry actions that would
>>> clobber the current L2 state that isn't in the cached VMCS12 (e.g.
>>> processing the VM-entry MSR load list). So, you need to have a flag
>>> indicating that this isn't a real VM-entry. That's no better than
>>> carrying the nested_run_pending flag.
>>
>> Not sure if that would really be necessary (would have to look into the
>> details first). But sounds like nested_run_pending seems unavoidable on
>> x86. So I'd better get used to QEMU dealing with nested CPU state (which
>> is somehow scary to me - an emulator getting involved in nested
>> execution - what could go wrong :) )
> 
> For save/restore, QEMU doesn't actually have to know what the flag
> means. It just has to pass it on. (Our userspace agent doesn't know
> anything about the meaning of the flags in the kvm_vmx_state
> structure.)

I agree on save/restore. My comment was rather about involving an L0
emulator when running an L2 guest. But as Paolo pointed out, this can't
really be avoided.
Jim Mattson Jan. 8, 2018, 9:55 p.m. UTC | #34
On Mon, Jan 8, 2018 at 1:46 PM, David Hildenbrand <david@redhat.com> wrote:
>> The vmcs01 does serve as a cache of L1 state at the time of VM-entry,
>> so if we simply restored the vmcs01 state, that would take care of
>> most of the rollback issues, as long as we don't deliver any
>> interrupts in this context. However, I would like to see the
>> vmcs01/vmcs02 separation go away at some point. svm.c seems to do fine
>> with just once VMCB.
>
> Interesting point, might make things easier for VMX.
>
>>
>>> If the page
>>>> tables have changed, or the L1 guest has overwritten the
>>>> VMLAUNCH/VMRESUME instruction, then you're out of luck.
>>>
>>> Page tables getting changed by other CPUs is actually a good point. But
>>> I would consider both as "theoretical" problems. At least compared to
>>> the interrupt stuff, which could also happen on guests behaving in a
>>> more sane way.
>>
>> My preference is for solutions that are architecturally correct,
>> thereby solving the theoretical problems as well as the empirical
>> ones. However, I grant that the Linux community leans the other way in
>> general.
>
> I usually agree, unless it makes the code horribly complicated without
> any real benefit. (e.g. for corner cases like this one: a CPU modifying
> instruction text of another CPU which is currently executing them)

I don't feel that one additional bit of serialized state is horribly
complicated. It's aesthetically unpleasant, to be sure, but not
horribly complicated. And it's considerably less complicated than your
proposal. :-)

>>>> 3. I'm assuming that you're planning to store most of the current L2
>>>> state in the cached VMCS12, at least where you can. Even so, the next
>>>> "VM-entry" can't perform any of the normal VM-entry actions that would
>>>> clobber the current L2 state that isn't in the cached VMCS12 (e.g.
>>>> processing the VM-entry MSR load list). So, you need to have a flag
>>>> indicating that this isn't a real VM-entry. That's no better than
>>>> carrying the nested_run_pending flag.
>>>
>>> Not sure if that would really be necessary (would have to look into the
>>> details first). But sounds like nested_run_pending seems unavoidable on
>>> x86. So I'd better get used to QEMU dealing with nested CPU state (which
>>> is somehow scary to me - an emulator getting involved in nested
>>> execution - what could go wrong :) )
>>
>> For save/restore, QEMU doesn't actually have to know what the flag
>> means. It just has to pass it on. (Our userspace agent doesn't know
>> anything about the meaning of the flags in the kvm_vmx_state
>> structure.)
>
> I agree on save/restore. My comment was rather about involving a L0
> emulator when running a L2 guest. But as Paolo pointed out, this can't
> really be avoided.
>
> --
>
> Thanks,
>
> David / dhildenb
David Hildenbrand Jan. 9, 2018, 8:19 a.m. UTC | #35
On 08.01.2018 22:55, Jim Mattson wrote:
> On Mon, Jan 8, 2018 at 1:46 PM, David Hildenbrand <david@redhat.com> wrote:
>>> The vmcs01 does serve as a cache of L1 state at the time of VM-entry,
>>> so if we simply restored the vmcs01 state, that would take care of
>>> most of the rollback issues, as long as we don't deliver any
>>> interrupts in this context. However, I would like to see the
>>> vmcs01/vmcs02 separation go away at some point. svm.c seems to do fine
>>> with just once VMCB.
>>
>> Interesting point, might make things easier for VMX.
>>
>>>
>>>> If the page
>>>>> tables have changed, or the L1 guest has overwritten the
>>>>> VMLAUNCH/VMRESUME instruction, then you're out of luck.
>>>>
>>>> Page tables getting changed by other CPUs is actually a good point. But
>>>> I would consider both as "theoretical" problems. At least compared to
>>>> the interrupt stuff, which could also happen on guests behaving in a
>>>> more sane way.
>>>
>>> My preference is for solutions that are architecturally correct,
>>> thereby solving the theoretical problems as well as the empirical
>>> ones. However, I grant that the Linux community leans the other way in
>>> general.
>>
>> I usually agree, unless it makes the code horribly complicated without
>> any real benefit. (e.g. for corner cases like this one: a CPU modifying
>> instruction text of another CPU which is currently executing them)
> 
> I don't feel that one additional bit of serialized state is horribly
> complicated. It's aesthetically unpleasant, to be sure, but not
> horribly complicated. And it's considerably less complicated than your
> proposal. :-)

Well, nested_run_pending is just the tip of the iceberg of the (in my
opinion complicated) business of exiting to user space with nested state
and exposing all* VCPU ioctls to it.

But as we learned, it's the little things that cause big problems :)
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6bbceb9..8694eb9 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3198,6 +3198,50 @@  struct kvm_reinject_control {
 pit_reinject = 0 (!reinject mode) is recommended, unless running an old
 operating system that uses the PIT for timing (e.g. Linux 2.4.x).
 
+4.99 KVM_GET_VMX_STATE
+
+Capability: KVM_CAP_VMX_STATE
+Architectures: x86/vmx
+Type: vcpu ioctl
+Parameters: struct kvm_vmx_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG:     the data size required exceeds the value of data_size specified
+             by the user (the size required will be written into data_size).
+
+The maximum data size is currently 8192.
+
+struct kvm_vmx_state {
+	__u64 vmxon_ptr;
+	__u64 current_vmptr;
+	__u32 flags;
+	__u32 data_size;
+	__u8 data[0];
+};
+
+This ioctl copies the vcpu's kvm_vmx_state struct from the kernel to
+userspace.
+
+
+4.100 KVM_SET_VMX_STATE
+
+Capability: KVM_CAP_VMX_STATE
+Architectures: x86/vmx
+Type: vcpu ioctl
+Parameters: struct kvm_vmx_state (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_vmx_state {
+	__u64 vmxon_ptr;
+	__u64 current_vmptr;
+	__u32 flags;
+	__u32 data_size;
+	__u8 data[0];
+};
+
+This copies the vcpu's kvm_vmx_state struct from userspace to the
+kernel.
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bdde807..d6be6f1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1020,6 +1020,11 @@  struct kvm_x86_ops {
 	void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+	int (*get_vmx_state)(struct kvm_vcpu *vcpu,
+			     struct kvm_vmx_state __user *user_vmx_state);
+	int (*set_vmx_state)(struct kvm_vcpu *vcpu,
+			     struct kvm_vmx_state __user *user_vmx_state);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 739c0c5..5aaf8bb 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -357,4 +357,16 @@  struct kvm_sync_regs {
 #define KVM_X86_QUIRK_LINT0_REENABLED	(1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED	(1 << 1)
 
+#define KVM_VMX_STATE_GUEST_MODE	0x00000001
+#define KVM_VMX_STATE_RUN_PENDING	0x00000002
+
+/* for KVM_CAP_VMX_STATE */
+struct kvm_vmx_state {
+	__u64 vmxon_ptr;
+	__u64 current_vmptr;
+	__u32 flags;
+	__u32 data_size;
+	__u8 data[0];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9f0c747..d75c183 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11275,6 +11275,141 @@  static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 			~FEATURE_CONTROL_LMCE;
 }
 
+static int get_vmcs_cache(struct kvm_vcpu *vcpu,
+			  struct kvm_vmx_state __user *user_vmx_state,
+			  struct kvm_vmx_state vmx_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	/*
+	 * When running L2, the authoritative vmcs12 state is in the
+	 * vmcs02. When running L1, the authoritative vmcs12 state is
+	 * in the shadow vmcs linked to vmcs01, unless
+	 * sync_shadow_vmcs is set, in which case, the authoritative
+	 * vmcs12 state is in the vmcs12 already.
+	 */
+	if (is_guest_mode(vcpu))
+		sync_vmcs12(vcpu, vmcs12);
+	else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
+		copy_shadow_to_vmcs12(vmx);
+	if (copy_to_user(user_vmx_state->data, vmcs12, VMCS12_SIZE))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int get_vmx_state(struct kvm_vcpu *vcpu,
+			 struct kvm_vmx_state __user *user_vmx_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_vmx_state vmx_state = {
+		.vmxon_ptr = -1ull,
+		.current_vmptr = -1ull,
+		.flags = 0,
+		.data_size = 0
+	};
+	u32 user_data_size;
+
+	if (copy_from_user(&user_data_size, &user_vmx_state->data_size,
+			   sizeof(user_data_size)))
+		return -EFAULT;
+
+	if (nested_vmx_allowed(vcpu) && vmx->nested.vmxon) {
+		vmx_state.vmxon_ptr = vmx->nested.vmxon_ptr;
+		vmx_state.current_vmptr = vmx->nested.current_vmptr;
+		if (vmx_state.current_vmptr != -1ull)
+			vmx_state.data_size += VMCS12_SIZE;
+		if (is_guest_mode(vcpu)) {
+			vmx_state.flags |= KVM_VMX_STATE_GUEST_MODE;
+			if (vmx->nested.nested_run_pending)
+				vmx_state.flags |= KVM_VMX_STATE_RUN_PENDING;
+		}
+	}
+
+	if (copy_to_user(user_vmx_state, &vmx_state, sizeof(vmx_state)))
+		return -EFAULT;
+
+	if (user_data_size < vmx_state.data_size)
+		return -E2BIG;
+
+	if (vmx_state.data_size > 0)
+		return get_vmcs_cache(vcpu, user_vmx_state, vmx_state);
+
+	return 0;
+}
+
+static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
+}
+
+static int set_vmcs_cache(struct kvm_vcpu *vcpu,
+			  struct kvm_vmx_state __user *user_vmx_state,
+			  struct kvm_vmx_state vmx_state)
+
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 exit_qual;
+
+	if (vmx_state.data_size < VMCS12_SIZE ||
+	    vmx_state.current_vmptr == vmx_state.vmxon_ptr ||
+	    !page_address_valid(vcpu, vmx_state.current_vmptr))
+		return -EINVAL;
+	if (copy_from_user(vmcs12, user_vmx_state->data, VMCS12_SIZE))
+		return -EFAULT;
+	if (vmcs12->revision_id != VMCS12_REVISION)
+		return -EINVAL;
+	set_current_vmptr(vmx, vmx_state.current_vmptr);
+	if (enable_shadow_vmcs)
+		vmx->nested.sync_shadow_vmcs = true;
+	if (!(vmx_state.flags & KVM_VMX_STATE_GUEST_MODE))
+		return 0;
+
+	if (check_vmentry_prereqs(vcpu, vmcs12) ||
+	    check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+		return -EINVAL;
+	return enter_vmx_non_root_mode(vcpu);
+}
+
+static int set_vmx_state(struct kvm_vcpu *vcpu,
+			 struct kvm_vmx_state __user *user_vmx_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_vmx_state vmx_state;
+	int ret;
+
+	if (copy_from_user(&vmx_state, user_vmx_state, sizeof(vmx_state)))
+		return -EFAULT;
+
+	if (vmx_state.flags &
+	    ~(KVM_VMX_STATE_RUN_PENDING | KVM_VMX_STATE_GUEST_MODE))
+		return -EINVAL;
+
+	if (!nested_vmx_allowed(vcpu))
+		return vmx_state.vmxon_ptr == -1ull ? 0 : -EINVAL;
+
+	vmx_leave_nested(vcpu);
+
+	vmx->nested.nested_run_pending =
+		!!(vmx_state.flags & KVM_VMX_STATE_RUN_PENDING);
+	if (vmx_state.vmxon_ptr == -1ull)
+		return 0;
+
+	if (!page_address_valid(vcpu, vmx_state.vmxon_ptr))
+		return -EINVAL;
+	vmx->nested.vmxon_ptr = vmx_state.vmxon_ptr;
+	ret = enter_vmx_operation(vcpu);
+	if (ret)
+		return ret;
+
+	if (vmx_state.current_vmptr == -1ull)
+		return 0;
+
+	return set_vmcs_cache(vcpu, user_vmx_state, vmx_state);
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -11403,6 +11538,9 @@  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 #endif
 
 	.setup_mce = vmx_setup_mce,
+
+	.get_vmx_state = get_vmx_state,
+	.set_vmx_state = set_vmx_state,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04c5d96..e249215 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2685,6 +2685,9 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X2APIC_API:
 		r = KVM_X2APIC_API_VALID_FLAGS;
 		break;
+	case KVM_CAP_VMX_STATE:
+		r = !!kvm_x86_ops->get_vmx_state;
+		break;
 	default:
 		r = 0;
 		break;
@@ -3585,6 +3588,22 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		break;
 	}
+	case KVM_GET_VMX_STATE: {
+		struct kvm_vmx_state __user *user_vmx_state = argp;
+
+		r = -EINVAL;
+		if (kvm_x86_ops->get_vmx_state)
+			r = kvm_x86_ops->get_vmx_state(vcpu, user_vmx_state);
+		goto out;
+	}
+	case KVM_SET_VMX_STATE: {
+		struct kvm_vmx_state __user *user_vmx_state = argp;
+
+		r = -EINVAL;
+		if (kvm_x86_ops->set_vmx_state)
+			r = kvm_x86_ops->set_vmx_state(vcpu, user_vmx_state);
+		goto out;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4ee67cb..ba3c586 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -870,6 +870,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_USER_INSTR0 130
 #define KVM_CAP_MSI_DEVID 131
 #define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_VMX_STATE 133
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1280,6 +1281,9 @@  struct kvm_s390_ucas_mapping {
 #define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
 /* Available with KVM_CAP_X86_SMM */
 #define KVM_SMI                   _IO(KVMIO,   0xb7)
+/* Available with KVM_CAP_VMX_STATE */
+#define KVM_GET_VMX_STATE         _IOWR(KVMIO, 0xb8, struct kvm_vmx_state)
+#define KVM_SET_VMX_STATE         _IOW(KVMIO,  0xb9, struct kvm_vmx_state)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
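
As a usage note on the API documented above: a minimal sketch, assuming
the series is applied as posted, of probing KVM_CAP_VMX_STATE and using
the documented E2BIG convention (the required size is written back into
data_size) to size the buffer before fetching the state. The function
name is illustrative only.

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_vmx_state *fetch_vmx_state(int kvm_fd, int vcpu_fd)
{
	struct kvm_vmx_state probe = { .data_size = 0 };
	struct kvm_vmx_state *state;

	/* KVM_CAP_VMX_STATE reports whether the new ioctls are available. */
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VMX_STATE) <= 0)
		return NULL;

	/*
	 * First call with data_size == 0: if a vmcs12 needs to be saved,
	 * the ioctl fails with E2BIG and writes the required size back
	 * into probe.data_size.
	 */
	if (ioctl(vcpu_fd, KVM_GET_VMX_STATE, &probe) < 0 && errno != E2BIG)
		return NULL;

	state = calloc(1, sizeof(*state) + probe.data_size);
	if (!state)
		return NULL;

	state->data_size = probe.data_size;
	if (ioctl(vcpu_fd, KVM_GET_VMX_STATE, state) < 0) {
		free(state);
		return NULL;
	}
	return state;
}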