Patchworkβ [v3,2/4] KVM: Add unified KVM_GET/SET_VCPU_STATE IOCTL

login
register
about
Submitter Jan Kiszka
Date 2009-11-02 16:20:29
Message ID <20091102162028.19049.64564.stgit@mchn012c.ww002.siemens.net>
Download mbox | patch
Permalink /patch/57070/
State New
Headers show

Comments

Jan Kiszka - 2009-11-02 16:20:29
Add a new IOCTL pair to retrieve or set the VCPU state in one chunk.
More precisely, the IOCTL is able to process a list of substates to be
read or written. This list is easily extensible without breaking the
existing ABI, thus we will no longer have to add new IOCTLs when we
discover a missing VCPU state field or want to support new hardware
features.

This patch establishes the generic infrastructure for KVM_GET/
SET_VCPU_STATE, introduces KVM_CHECK_VCPU_STATES to query supported
substates, and adds support for the generic substates REGS, SREGS, FPU,
and MP. To avoid code duplication, the entry point for the corresponding
original IOCTLs are converted to make use of the new infrastructure
internally, too.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---

 Documentation/kvm/api.txt  |   94 +++++++++++
 arch/ia64/kvm/kvm-ia64.c   |   17 ++
 arch/powerpc/kvm/powerpc.c |   17 ++
 arch/s390/kvm/kvm-s390.c   |   17 ++
 arch/x86/kvm/x86.c         |   17 ++
 include/linux/kvm.h        |   32 ++++
 include/linux/kvm_host.h   |    6 +
 virt/kvm/kvm_main.c        |  367 +++++++++++++++++++++++++++++++++-----------
 8 files changed, 475 insertions(+), 92 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity - 2009-11-04 11:18:42
On 11/02/2009 06:20 PM, Jan Kiszka wrote:
> Add a new IOCTL pair to retrieve or set the VCPU state in one chunk.
> More precisely, the IOCTL is able to process a list of substates to be
> read or written. This list is easily extensible without breaking the
> existing ABI, thus we will no longer have to add new IOCTLs when we
> discover a missing VCPU state field or want to support new hardware
> features.
>
> This patch establishes the generic infrastructure for KVM_GET/
> SET_VCPU_STATE, introduces KVM_CHECK_VCPU_STATES to query supported
> substates, and adds support for the generic substates REGS, SREGS, FPU,
> and MP. To avoid code duplication, the entry point for the corresponding
> original IOCTLs are converted to make use of the new infrastructure
> internally, too.
>
> +
> +struct kvm_vcpu_substate {
> +	__u32 type;	/* KVM_VCPU_STATE_* or KVM_$(ARCH)_VCPU_STATE_* */
> +	__u32 pad;
> +	__s64 offset;	/* payload offset to kvm_vcpu_state in bytes */
> +};
>    

64-bit offset is a bit too much, and can't be made to work on 32-bit 
hosts.  Also, why signed?  it's begging for trouble.
Jan Kiszka - 2009-11-04 11:35:53
Avi Kivity wrote:
> On 11/02/2009 06:20 PM, Jan Kiszka wrote:
>> Add a new IOCTL pair to retrieve or set the VCPU state in one chunk.
>> More precisely, the IOCTL is able to process a list of substates to be
>> read or written. This list is easily extensible without breaking the
>> existing ABI, thus we will no longer have to add new IOCTLs when we
>> discover a missing VCPU state field or want to support new hardware
>> features.
>>
>> This patch establishes the generic infrastructure for KVM_GET/
>> SET_VCPU_STATE, introduces KVM_CHECK_VCPU_STATES to query supported
>> substates, and adds support for the generic substates REGS, SREGS, FPU,
>> and MP. To avoid code duplication, the entry point for the corresponding
>> original IOCTLs are converted to make use of the new infrastructure
>> internally, too.
>>
>> +
>> +struct kvm_vcpu_substate {
>> +	__u32 type;	/* KVM_VCPU_STATE_* or KVM_$(ARCH)_VCPU_STATE_* */
>> +	__u32 pad;
>> +	__s64 offset;	/* payload offset to kvm_vcpu_state in bytes */
>> +};
>>    
> 
> 64-bit offset is a bit too much, and can't be made to work on 32-bit 
> hosts.  Also, why signed?  it's begging for trouble.

Please elaborate what troubles you see precisely. The offset is supposed
to make the whole user's address space reachable relative to the
header's base address. Therefore 64 bit, therefore signed.

Jan
Avi Kivity - 2009-11-04 12:52:32
On 11/04/2009 01:35 PM, Jan Kiszka wrote:
> Avi Kivity wrote:
>    
>> On 11/02/2009 06:20 PM, Jan Kiszka wrote:
>>      
>>> Add a new IOCTL pair to retrieve or set the VCPU state in one chunk.
>>> More precisely, the IOCTL is able to process a list of substates to be
>>> read or written. This list is easily extensible without breaking the
>>> existing ABI, thus we will no longer have to add new IOCTLs when we
>>> discover a missing VCPU state field or want to support new hardware
>>> features.
>>>
>>> This patch establishes the generic infrastructure for KVM_GET/
>>> SET_VCPU_STATE, introduces KVM_CHECK_VCPU_STATES to query supported
>>> substates, and adds support for the generic substates REGS, SREGS, FPU,
>>> and MP. To avoid code duplication, the entry point for the corresponding
>>> original IOCTLs are converted to make use of the new infrastructure
>>> internally, too.
>>>
>>> +
>>> +struct kvm_vcpu_substate {
>>> +	__u32 type;	/* KVM_VCPU_STATE_* or KVM_$(ARCH)_VCPU_STATE_* */
>>> +	__u32 pad;
>>> +	__s64 offset;	/* payload offset to kvm_vcpu_state in bytes */
>>> +};
>>>
>>>        
>> 64-bit offset is a bit too much, and can't be made to work on 32-bit
>> hosts.  Also, why signed?  it's begging for trouble.
>>      
> Please elaborate what troubles you see precisely. The offset is supposed
> to make the whole user's address space reachable relative to the
> header's base address. Therefore 64 bit, therefore signed.
>    

Misread the code; patch is fine.
Avi Kivity - 2009-11-10 10:14:23
On 11/02/2009 06:20 PM, Jan Kiszka wrote:
> Add a new IOCTL pair to retrieve or set the VCPU state in one chunk.
> More precisely, the IOCTL is able to process a list of substates to be
> read or written. This list is easily extensible without breaking the
> existing ABI, thus we will no longer have to add new IOCTLs when we
> discover a missing VCPU state field or want to support new hardware
> features.
>    

I'm having some second thoughts about this.

What does the new API buy us?  Instead of declaring two new ioctls for 
new/fixed substates, we only have to declare one.  We still have the 
capability check.  We still have to declare a structure.

It's true that the internals are currently a mess.  We can fix that with 
a table-driven approach:

static struct kvm_state_ioctl state_ioctls[] = {
    {
         .get_ioctl = KVM_GET_FPU,
         .set_ioctl = KVM_SET_FPU,
         .get = kvm_get_fpu,
         .set = kvm_set_fpu,
         .size = sizeof(struct kvm_fpu), /* 0 for variable-size state */
     },
};

(btw, the new stuff would also benefit from this).

So, what's the real justification for the new ABI?

Jan, my apologies for raising this at such a very late stage in the 
review, after all the nits have been satisfactorily addressed.  But I 
want to make sure we don't bloat the interface without very good reasons.
Jan Kiszka - 2009-11-10 12:03:29
Avi Kivity wrote:
> On 11/02/2009 06:20 PM, Jan Kiszka wrote:
>> Add a new IOCTL pair to retrieve or set the VCPU state in one chunk.
>> More precisely, the IOCTL is able to process a list of substates to be
>> read or written. This list is easily extensible without breaking the
>> existing ABI, thus we will no longer have to add new IOCTLs when we
>> discover a missing VCPU state field or want to support new hardware
>> features.
>>    
> 
> I'm having some second thoughts about this.
> 
> What does the new API buy us?  Instead of declaring two new ioctls for 
> new/fixed substates, we only have to declare one.  We still have the 
> capability check.  We still have to declare a structure.

Right, we still need CAPs to protect us against undefined types. So
KVM_CHECK_VCPU_STATES is actually pointless - well, someone asked for it...

> 
> It's true that the internals are currently a mess.  We can fix that with 
> a table-driven approach:
> 
> static struct kvm_state_ioctl state_ioctls[] = {
>     {
>          .get_ioctl = KVM_GET_FPU,
>          .set_ioctl = KVM_SET_FPU,
>          .get = kvm_get_fpu,
>          .set = kvm_set_fpu,
>          .size = sizeof(struct kvm_fpu), /* 0 for variable-size state */

Even a variable-sized state has a fixed-size header. The handlers would
have to deal with this, or we would need to define which field in the
header holds the extension size, and what is its multiplier.

>      },
> };
> 
> (btw, the new stuff would also benefit from this).

Right.

> 
> So, what's the real justification for the new ABI?

The remaining differences are:
 - single kernel call possible
 - slightly higher regularity (the IOCTL space is rather chaotic)

> 
> Jan, my apologies for raising this at such a very late stage in the 
> review, after all the nits have been satisfactorily addressed.  But I 
> want to make sure we don't bloat the interface without very good reasons.

I think we came from the idea: "Let's have one new IOCTL that will fit
it all - now and then." That's obviously not cheaply achievable. So the
valid question is what our extension concept of the future should be,
the existing multi-IOCTL approach or the substates? I only have a slight
bias towards the latter but the strong wish to achieve to a final decision.

Jan
Avi Kivity - 2009-11-10 12:57:38
On 11/10/2009 02:03 PM, Jan Kiszka wrote:
>
>> I'm having some second thoughts about this.
>>
>> What does the new API buy us?  Instead of declaring two new ioctls for
>> new/fixed substates, we only have to declare one.  We still have the
>> capability check.  We still have to declare a structure.
>>      
> Right, we still need CAPs to protect us against undefined types. So
> KVM_CHECK_VCPU_STATES is actually pointless - well, someone asked for it...
>    

It's not pointless - you can do a compile-time check for 
KVM_VCPU_STATE_... and a runtime check using KVM_CHECK_VCPU_STATES.  But 
it does duplicate the existing KVM_CAP_ functionality.

>> It's true that the internals are currently a mess.  We can fix that with
>> a table-driven approach:
>>
>> static struct kvm_state_ioctl state_ioctls[] = {
>>      {
>>           .get_ioctl = KVM_GET_FPU,
>>           .set_ioctl = KVM_SET_FPU,
>>           .get = kvm_get_fpu,
>>           .set = kvm_set_fpu,
>>           .size = sizeof(struct kvm_fpu), /* 0 for variable-size state */
>>      
> Even a variable-sized state has a fixed-size header. The handlers would
> have to deal with this, or we would need to define which field in the
> header holds the extension size, and what is its multiplier.
>    

Since we have very few variable-size states, and their number is 
unlikely to increase, ad-hoc handling should be sufficient.

>> So, what's the real justification for the new ABI?
>>      
> The remaining differences are:
>   - single kernel call possible
>    

Is there a real advantage in this?  It's not a high performance call, 
typically only called during save/restore, reset, and for vmware's 
wonderful ioport interface.

>   - slightly higher regularity (the IOCTL space is rather chaotic)
>    

But still, actually handling the state is not regular either on the 
userspace or kernel side.

>> Jan, my apologies for raising this at such a very late stage in the
>> review, after all the nits have been satisfactorily addressed.  But I
>> want to make sure we don't bloat the interface without very good reasons.
>>      
> I think we came from the idea: "Let's have one new IOCTL that will fit
> it all - now and then." That's obviously not cheaply achievable. So the
> valid question is what our extension concept of the future should be,
> the existing multi-IOCTL approach or the substates? I only have a slight
> bias towards the latter but the strong wish to achieve to a final decision.
>    

It would have been better to start from substates in the first place, 
since there is less duplication: instead of 2 x NR_STATES ioctls, we 
define 2 ioctls + NR_STATES defines.  It's more regular and less chance 
for errors (like misspelling _IOR/_IOW).

But given that we already do have the old interface, perhaps it's best 
to stick with it and concentrate on improving the internals.
Jan Kiszka - 2009-11-10 13:22:28
Avi Kivity wrote:
> On 11/10/2009 02:03 PM, Jan Kiszka wrote:
>>> I'm having some second thoughts about this.
>>>
>>> What does the new API buy us?  Instead of declaring two new ioctls for
>>> new/fixed substates, we only have to declare one.  We still have the
>>> capability check.  We still have to declare a structure.
>>>      
>> Right, we still need CAPs to protect us against undefined types. So
>> KVM_CHECK_VCPU_STATES is actually pointless - well, someone asked for it...
>>    
> 
> It's not pointless - you can do a compile-time check for 
> KVM_VCPU_STATE_... and a runtime check using KVM_CHECK_VCPU_STATES.  But 
> it does duplicate the existing KVM_CAP_ functionality.

It's redundant, therefore I considered it pointless.

> 
>>> It's true that the internals are currently a mess.  We can fix that with
>>> a table-driven approach:
>>>
>>> static struct kvm_state_ioctl state_ioctls[] = {
>>>      {
>>>           .get_ioctl = KVM_GET_FPU,
>>>           .set_ioctl = KVM_SET_FPU,
>>>           .get = kvm_get_fpu,
>>>           .set = kvm_set_fpu,
>>>           .size = sizeof(struct kvm_fpu), /* 0 for variable-size state */
>>>      
>> Even a variable-sized state has a fixed-size header. The handlers would
>> have to deal with this, or we would need to define which field in the
>> header holds the extension size, and what is its multiplier.
>>    
> 
> Since we have very few variable-size states, and their number is 
> unlikely to increase, ad-hoc handling should be sufficient.

Regarding CPU states, there is actually only the MSR interface.

> 
>>> So, what's the real justification for the new ABI?
>>>      
>> The remaining differences are:
>>   - single kernel call possible
>>    
> 
> Is there a real advantage in this?  It's not a high performance call, 
> typically only called during save/restore, reset, and for vmware's 
> wonderful ioport interface.

And debugging. But, true, this is all fairly uncritical.

> 
>>   - slightly higher regularity (the IOCTL space is rather chaotic)
>>    
> 
> But still, actually handling the state is not regular either on the 
> userspace or kernel side.
> 
>>> Jan, my apologies for raising this at such a very late stage in the
>>> review, after all the nits have been satisfactorily addressed.  But I
>>> want to make sure we don't bloat the interface without very good reasons.
>>>      
>> I think we came from the idea: "Let's have one new IOCTL that will fit
>> it all - now and then." That's obviously not cheaply achievable. So the
>> valid question is what our extension concept of the future should be,
>> the existing multi-IOCTL approach or the substates? I only have a slight
>> bias towards the latter but the strong wish to achieve to a final decision.
>>    
> 
> It would have been better to start from substates in the first place, 
> since there is less duplication: instead of 2 x NR_STATES ioctls, we 
> define 2 ioctls + NR_STATES defines.  It's more regular and less chance 
> for errors (like misspelling _IOR/_IOW).
> 
> But given that we already do have the old interface, perhaps it's best 
> to stick with it and concentrate on improving the internals.

So the new roadmap shall be like this:

 o Add KVM_X86_GET/SET_EVENT_STATES (instead of
   KVM_X86_VCPU_STATE_EVENTS)

 o Refactor in-kernel VCPU state IOCTLs to use table-driven dispatching
   and shared argument passing

 o Maybe refactor user space as well towards a table-driven state sync
   (need to think about this a bit more)

Any other comments or does everyone agree?

Jan
Avi Kivity - 2009-11-10 13:31:29
On 11/10/2009 03:22 PM, Jan Kiszka wrote:
>
>> Since we have very few variable-size states, and their number is
>> unlikely to increase, ad-hoc handling should be sufficient.
>>      
> Regarding CPU states, there is actually only the MSR interface.
>
>    

cpuid internal states too.

> So the new roadmap shall be like this:
>
>   o Add KVM_X86_GET/SET_EVENT_STATES (instead of
>     KVM_X86_VCPU_STATE_EVENTS)
>
>   o Refactor in-kernel VCPU state IOCTLs to use table-driven dispatching
>     and shared argument passing
>
>   o Maybe refactor user space as well towards a table-driven state sync
>     (need to think about this a bit more)
>
> Any other comments or does everyone agree?
>    

Looks good to me.  These are all independent, of course.
Jan Kiszka - 2009-11-10 13:41:53
Avi Kivity wrote:
> On 11/10/2009 03:22 PM, Jan Kiszka wrote:
>>> Since we have very few variable-size states, and their number is
>>> unlikely to increase, ad-hoc handling should be sufficient.
>>>      
>> Regarding CPU states, there is actually only the MSR interface.
>>
>>    
> 
> cpuid internal states too.

Oh, yes.

> 
>> So the new roadmap shall be like this:
>>
>>   o Add KVM_X86_GET/SET_EVENT_STATES (instead of
>>     KVM_X86_VCPU_STATE_EVENTS)
>>
>>   o Refactor in-kernel VCPU state IOCTLs to use table-driven dispatching
>>     and shared argument passing
>>
>>   o Maybe refactor user space as well towards a table-driven state sync
>>     (need to think about this a bit more)
>>
>> Any other comments or does everyone agree?
>>    
> 
> Looks good to me.  These are all independent, of course.

Yep. The first one will come first and ASAP, should be easy to derive
from existing series. The others have to wait a bit now.

Jan

Patch

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 36594ba..3c98d0a 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -653,6 +653,70 @@  struct kvm_clock_data {
 	__u32 pad[9];
 };
 
+4.29 KVM_GET/SET_VCPU_STATE
+
+Capability: KVM_CAP_VCPU_STATE
+Architectures: all (substate support may vary across architectures)
+Type: vcpu ioctl
+Parameters: struct kvm_vcpu_state (in/out)
+Returns: 0 on success, -1 on error
+
+Reads or sets one or more vcpu substates.
+
+The data structures exchanged between user space and kernel are organized
+in two layers. Layer one is the header structure kvm_vcpu_state:
+
+struct kvm_vcpu_state {
+	__u32 nsubstates; /* number of elements in substates */
+	__u32 nprocessed; /* return value: successfully processed substates */
+	struct kvm_vcpu_substate substates[0];
+};
+
+The kernel accepts up to KVM_MAX_VCPU_SUBSTATES elements in the substates
+array. An element is described by kvm_vcpu_substate:
+
+struct kvm_vcpu_substate {
+	__u32 type;	/* KVM_VCPU_STATE_* or KVM_$(ARCH)_VCPU_STATE_* */
+	__u32 pad;
+	__s64 offset;	/* payload offset to kvm_vcpu_state in bytes */
+};
+
+Layer two are the substate-specific payload structures. See section 6 for a
+list of supported substates and their payload format.
+
+Exemplary setup for a single-substate query via KVM_GET_VCPU_STATE:
+
+	struct {
+		struct kvm_vcpu_state header;
+		struct kvm_vcpu_substate substates[1];
+	} request;
+	struct kvm_regs regs;
+
+	request.header.nsubstates = 1;
+	request.header.substates[0].type = KVM_VCPU_STATE_REGS;
+	request.header.substates[0].offset = (size_t)&regs - (size_t)&request;
+
+4.30 KVM_CHECK_VCPU_STATES
+
+Capability: KVM_CAP_VCPU_STATE
+Architectures: all
+Type: system ioctl
+Parameters: struct kvm_vcpu_state_list (in/out)
+Returns: 0 on success, -1 on error
+
+Checks if the given list of vcpu substates is supported by the kernel.
+
+On entry, user space passes the list of substate types it wants to query in
+the following structure:
+
+struct kvm_vcpu_state_list {
+	__u32 ntypes;
+	__u32 types[0];	/* KVM_VCPU_STATE_* or KVM_$(ARCH)_VCPU_STATE_* */
+};
+
+The kernel will set all types it does not support to KVM_VCPU_STATE_INVALID on
+return.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
@@ -817,3 +881,33 @@  powerpc specific.
 		char padding[256];
 	};
 };
+
+6. Supported vcpu substates
+
+6.1 KVM_VCPU_STATE_REGS
+
+Architectures: all
+Payload: struct kvm_regs (see KVM_GET_REGS)
+Deprecates: KVM_GET/SET_REGS
+
+6.2 KVM_VCPU_STATE_SREGS
+
+Architectures: all
+Payload: struct kvm_sregs (see KVM_GET_SREGS)
+Deprecates: KVM_GET/SET_SREGS
+
+6.3 KVM_VCPU_STATE_FPU
+
+Architectures: all
+Payload: struct kvm_fpu (see KVM_GET_FPU)
+Deprecates: KVM_GET/SET_FPU
+
+6.4 KVM_VCPU_STATE_MP
+
+Architectures: x86, ia64
+Payload: struct kvm_mp_state
+Deprecates: KVM_GET/SET_MP_STATE
+
+struct kvm_mp_state {
+	__u32 mp_state;	/* KVM_MP_STATE_* */
+};
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 5fdeec5..7f04603 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1991,3 +1991,20 @@  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	vcpu_put(vcpu);
 	return r;
 }
+
+int kvm_arch_vcpu_get_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate)
+{
+	return -EINVAL;
+}
+
+int kvm_arch_vcpu_set_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate)
+{
+	return -EINVAL;
+}
+
+bool kvm_arch_check_substate(u32 type)
+{
+	return false;
+}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 5902bbc..67f4775 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -436,3 +436,20 @@  int kvm_arch_init(void *opaque)
 void kvm_arch_exit(void)
 {
 }
+
+int kvm_arch_vcpu_get_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate)
+{
+	return -EINVAL;
+}
+
+int kvm_arch_vcpu_set_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate)
+{
+	return -EINVAL;
+}
+
+bool kvm_arch_check_substate(u32 type)
+{
+	return false;
+}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5445058..058002b 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -450,6 +450,23 @@  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return -EINVAL; /* not implemented yet */
 }
 
+int kvm_arch_vcpu_get_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate)
+{
+	return -EINVAL;
+}
+
+int kvm_arch_vcpu_set_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate)
+{
+	return -EINVAL;
+}
+
+bool kvm_arch_check_substate(u32 type)
+{
+	return false;
+}
+
 static void __vcpu_run(struct kvm_vcpu *vcpu)
 {
 	memcpy(&vcpu->arch.sie_block->gg14, &vcpu->arch.guest_gprs[14], 16);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1cc51ca..816e681 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4842,6 +4842,23 @@  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
 
+int kvm_arch_vcpu_get_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate)
+{
+	return -EINVAL;
+}
+
+int kvm_arch_vcpu_set_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate)
+{
+	return -EINVAL;
+}
+
+bool kvm_arch_check_substate(u32 type)
+{
+	return false;
+}
+
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.time_page) {
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ca62b8e..473eaec 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -393,6 +393,32 @@  struct kvm_ioeventfd {
 	__u8  pad[36];
 };
 
+/* for KVM_GET_VCPU_STATE and KVM_SET_VCPU_STATE */
+#define KVM_VCPU_STATE_INVALID		0
+#define KVM_VCPU_STATE_REGS		1
+#define KVM_VCPU_STATE_SREGS		2
+#define KVM_VCPU_STATE_FPU		3
+#define KVM_VCPU_STATE_MP		4
+
+struct kvm_vcpu_substate {
+	__u32 type;	/* KVM_VCPU_STATE_* or KVM_$(ARCH)_VCPU_STATE_* */
+	__u32 pad;
+	__s64 offset;	/* payload offset to kvm_vcpu_state in bytes */
+};
+
+#define KVM_MAX_VCPU_SUBSTATES		64
+
+struct kvm_vcpu_state {
+	__u32 nsubstates; /* number of elements in substates */
+	__u32 nprocessed; /* return value: successfully processed substates */
+	struct kvm_vcpu_substate substates[0];
+};
+
+struct kvm_vcpu_state_list {
+	__u32 ntypes;
+	__u32 types[0];	/* KVM_VCPU_STATE_* or KVM_$(ARCH)_VCPU_STATE_* */
+};
+
 #define KVMIO 0xAE
 
 /*
@@ -416,6 +442,8 @@  struct kvm_ioeventfd {
 #define KVM_TRACE_ENABLE          __KVM_DEPRECATED_MAIN_W_0x06
 #define KVM_TRACE_PAUSE           __KVM_DEPRECATED_MAIN_0x07
 #define KVM_TRACE_DISABLE         __KVM_DEPRECATED_MAIN_0x08
+#define KVM_CHECK_VCPU_STATES     _IOWR(KVMIO, 0x09, \
+					struct kvm_vcpu_state_list)
 
 /*
  * Extension capability list.
@@ -484,6 +512,7 @@  struct kvm_ioeventfd {
 #define KVM_CAP_XEN_HVM 38
 #endif
 #define KVM_CAP_ADJUST_CLOCK 39
+#define KVM_CAP_VCPU_STATE 40
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -667,6 +696,9 @@  struct kvm_clock_data {
 /* IA64 stack access */
 #define KVM_IA64_VCPU_GET_STACK   _IOR(KVMIO,  0x9a, void *)
 #define KVM_IA64_VCPU_SET_STACK   _IOW(KVMIO,  0x9b, void *)
+/* Available with KVM_CAP_VCPU_STATE */
+#define KVM_GET_VCPU_STATE        _IOR(KVMIO,  0x9f, struct kvm_vcpu_state)
+#define KVM_SET_VCPU_STATE        _IOW(KVMIO,  0xa0, struct kvm_vcpu_state)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bd5a616..8a39bd6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -332,6 +332,12 @@  int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 					struct kvm_guest_debug *dbg);
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 
+int kvm_arch_vcpu_get_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate);
+int kvm_arch_vcpu_set_substate(struct kvm_vcpu *vcpu, uint8_t __user *arg_base,
+			       struct kvm_vcpu_substate *substate);
+bool kvm_arch_check_substate(u32 type);
+
 int kvm_arch_init(void *opaque);
 void kvm_arch_exit(void);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bd44fb4..4623c18 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1247,124 +1247,224 @@  static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
 	return 0;
 }
 
-static long kvm_vcpu_ioctl(struct file *filp,
-			   unsigned int ioctl, unsigned long arg)
+static int kvm_vcpu_get_substate(struct kvm_vcpu *vcpu,
+				 uint8_t __user *arg_base,
+				 struct kvm_vcpu_substate *substate)
 {
-	struct kvm_vcpu *vcpu = filp->private_data;
-	void __user *argp = (void __user *)arg;
+	void __user *argp = (void __user *)arg_base + substate->offset;
 	int r;
-	struct kvm_fpu *fpu = NULL;
-	struct kvm_sregs *kvm_sregs = NULL;
 
-	if (vcpu->kvm->mm != current->mm)
-		return -EIO;
-	switch (ioctl) {
-	case KVM_RUN:
-		r = -EINVAL;
-		if (arg)
-			goto out;
-		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
-		break;
-	case KVM_GET_REGS: {
+	switch (substate->type) {
+	case KVM_VCPU_STATE_REGS: {
 		struct kvm_regs *kvm_regs;
 
-		r = -ENOMEM;
 		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+		r = -ENOMEM;
 		if (!kvm_regs)
-			goto out;
+			break;
 		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
 		if (r)
-			goto out_free1;
+			goto out_free_regs;
 		r = -EFAULT;
 		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
-			goto out_free1;
+			goto out_free_regs;
 		r = 0;
-out_free1:
+out_free_regs:
 		kfree(kvm_regs);
 		break;
 	}
-	case KVM_SET_REGS: {
-		struct kvm_regs *kvm_regs;
+	case KVM_VCPU_STATE_SREGS: {
+		struct kvm_sregs *kvm_sregs;
 
+		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
 		r = -ENOMEM;
-		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
-		if (!kvm_regs)
-			goto out;
-		r = -EFAULT;
-		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
-			goto out_free2;
-		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
+		if (!kvm_sregs)
+			break;
+		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
 		if (r)
-			goto out_free2;
+			goto out_free_sregs;
+		r = -EFAULT;
+		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
+			goto out_free_sregs;
 		r = 0;
-out_free2:
-		kfree(kvm_regs);
+out_free_sregs:
+		kfree(kvm_sregs);
 		break;
 	}
-	case KVM_GET_SREGS: {
-		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+	case KVM_VCPU_STATE_FPU: {
+		struct kvm_fpu *kvm_fpu;
+
+		kvm_fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
 		r = -ENOMEM;
-		if (!kvm_sregs)
-			goto out;
-		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
+		if (!kvm_fpu)
+			break;
+		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, kvm_fpu);
 		if (r)
-			goto out;
+			break;
 		r = -EFAULT;
-		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
-			goto out;
+		if (copy_to_user(argp, kvm_fpu, sizeof(struct kvm_fpu)))
+			goto out_free_fpu;
 		r = 0;
+out_free_fpu:
+		kfree(kvm_fpu);
 		break;
 	}
-	case KVM_SET_SREGS: {
+	case KVM_VCPU_STATE_MP: {
+		struct kvm_mp_state mp_state;
+
+		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
+		if (r)
+			break;
+		r = -EFAULT;
+		if (copy_to_user(argp, &mp_state, sizeof(struct kvm_mp_state)))
+			break;
+		r = 0;
+		break;
+	}
+	default:
+		r = kvm_arch_vcpu_get_substate(vcpu, arg_base, substate);
+	}
+	return r;
+}
+
+static int kvm_vcpu_set_substate(struct kvm_vcpu *vcpu,
+				 uint8_t __user *arg_base,
+				 struct kvm_vcpu_substate *substate)
+{
+	void __user *argp = (void __user *)arg_base + substate->offset;
+	int r;
+
+	switch (substate->type) {
+	case KVM_VCPU_STATE_REGS: {
+		struct kvm_regs *kvm_regs;
+
+		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!kvm_regs)
+			break;
+		r = -EFAULT;
+		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
+			goto out_free_regs;
+		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
+		if (r)
+			goto out_free_regs;
+		r = 0;
+out_free_regs:
+		kfree(kvm_regs);
+		break;
+	}
+	case KVM_VCPU_STATE_SREGS: {
+		struct kvm_sregs *kvm_sregs;
+
 		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
 		r = -ENOMEM;
 		if (!kvm_sregs)
-			goto out;
+			break;
 		r = -EFAULT;
 		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
-			goto out;
+			goto out_free_sregs;
 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
 		if (r)
-			goto out;
+			goto out_free_sregs;
 		r = 0;
+out_free_sregs:
+		kfree(kvm_sregs);
 		break;
 	}
-	case KVM_GET_MP_STATE: {
-		struct kvm_mp_state mp_state;
+	case KVM_VCPU_STATE_FPU: {
+		struct kvm_fpu *kvm_fpu;
 
-		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
-		if (r)
-			goto out;
+		kvm_fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!kvm_fpu)
+			break;
 		r = -EFAULT;
-		if (copy_to_user(argp, &mp_state, sizeof mp_state))
-			goto out;
+		if (copy_from_user(kvm_fpu, argp, sizeof(struct kvm_fpu)))
+			goto out_free_fpu;
+		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, kvm_fpu);
+		if (r)
+			goto out_free_fpu;
 		r = 0;
+out_free_fpu:
+		kfree(kvm_fpu);
 		break;
 	}
-	case KVM_SET_MP_STATE: {
+	case KVM_VCPU_STATE_MP: {
 		struct kvm_mp_state mp_state;
 
 		r = -EFAULT;
-		if (copy_from_user(&mp_state, argp, sizeof mp_state))
-			goto out;
+		if (copy_from_user(&mp_state, argp,
+				   sizeof(struct kvm_mp_state)))
+			break;
 		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
+	default:
+		r = kvm_arch_vcpu_set_substate(vcpu, arg_base, substate);
+	}
+	return r;
+}
+
+
+static long kvm_vcpu_ioctl(struct file *filp,
+			   unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	struct kvm_vcpu_substate substate;
+	int r;
+
+	if (vcpu->kvm->mm != current->mm)
+		return -EIO;
+	switch (ioctl) {
+	case KVM_RUN:
+		r = -EINVAL;
+		if (arg)
+			break;
+		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+		break;
+	case KVM_GET_REGS:
+		substate.type = KVM_VCPU_STATE_REGS;
+		substate.offset = 0;
+		r = kvm_vcpu_get_substate(vcpu, argp, &substate);
+		break;
+	case KVM_SET_REGS:
+		substate.type = KVM_VCPU_STATE_REGS;
+		substate.offset = 0;
+		r = kvm_vcpu_set_substate(vcpu, argp, &substate);
+		break;
+	case KVM_GET_SREGS:
+		substate.type = KVM_VCPU_STATE_SREGS;
+		substate.offset = 0;
+		r = kvm_vcpu_get_substate(vcpu, argp, &substate);
+		break;
+	case KVM_SET_SREGS:
+		substate.type = KVM_VCPU_STATE_SREGS;
+		substate.offset = 0;
+		r = kvm_vcpu_set_substate(vcpu, argp, &substate);
+		break;
+	case KVM_GET_MP_STATE:
+		substate.type = KVM_VCPU_STATE_MP;
+		substate.offset = 0;
+		r = kvm_vcpu_get_substate(vcpu, argp, &substate);
+		break;
+	case KVM_SET_MP_STATE:
+		substate.type = KVM_VCPU_STATE_MP;
+		substate.offset = 0;
+		r = kvm_vcpu_set_substate(vcpu, argp, &substate);
+		break;
 	case KVM_TRANSLATE: {
 		struct kvm_translation tr;
 
 		r = -EFAULT;
 		if (copy_from_user(&tr, argp, sizeof tr))
-			goto out;
+			break;
 		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
 		if (r)
-			goto out;
+			break;
 		r = -EFAULT;
 		if (copy_to_user(argp, &tr, sizeof tr))
-			goto out;
+			break;
 		r = 0;
 		break;
 	}
@@ -1373,10 +1473,10 @@  out_free2:
 
 		r = -EFAULT;
 		if (copy_from_user(&dbg, argp, sizeof dbg))
-			goto out;
+			break;
 		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
 		if (r)
-			goto out;
+			break;
 		r = 0;
 		break;
 	}
@@ -1390,53 +1490,86 @@  out_free2:
 			r = -EFAULT;
 			if (copy_from_user(&kvm_sigmask, argp,
 					   sizeof kvm_sigmask))
-				goto out;
+				break;
 			r = -EINVAL;
 			if (kvm_sigmask.len != sizeof sigset)
-				goto out;
+				break;
 			r = -EFAULT;
 			if (copy_from_user(&sigset, sigmask_arg->sigset,
 					   sizeof sigset))
-				goto out;
+				break;
 			p = &sigset;
 		}
 		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
 		break;
 	}
-	case KVM_GET_FPU: {
-		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!fpu)
-			goto out;
-		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
-		if (r)
-			goto out;
-		r = -EFAULT;
-		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
-			goto out;
-		r = 0;
+	case KVM_GET_FPU:
+		substate.type = KVM_VCPU_STATE_FPU;
+		substate.offset = 0;
+		r = kvm_vcpu_get_substate(vcpu, argp, &substate);
 		break;
-	}
-	case KVM_SET_FPU: {
-		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!fpu)
-			goto out;
+	case KVM_SET_FPU:
+		substate.type = KVM_VCPU_STATE_FPU;
+		substate.offset = 0;
+		r = kvm_vcpu_set_substate(vcpu, argp, &substate);
+		break;
+	case KVM_GET_VCPU_STATE:
+	case KVM_SET_VCPU_STATE: {
+		struct kvm_vcpu_state __user *user_head = argp;
+		struct kvm_vcpu_substate *substates = NULL;
+		uint8_t __user *arg_base = argp;
+		struct kvm_vcpu_state head;
+		size_t size;
+		int i;
+
 		r = -EFAULT;
-		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
-			goto out;
-		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
-		if (r)
-			goto out;
+		if (copy_from_user(&head, user_head,
+				   sizeof(struct kvm_vcpu_state)))
+			break;
+
+		head.nprocessed = 0;
+
+		size = head.nsubstates * sizeof(struct kvm_vcpu_substate);
+		if (head.nsubstates <= 1) {
+			substates = &substate;
+		} else {
+			r = -E2BIG;
+			if (head.nsubstates > KVM_MAX_VCPU_SUBSTATES)
+				goto vcpu_state_out;
+
+			substates = kmalloc(size, GFP_KERNEL);
+			r = -ENOMEM;
+			if (!substates)
+				goto vcpu_state_out;
+		}
+
+		r = -EFAULT;
+		if (copy_from_user(substates, user_head->substates, size))
+			goto vcpu_state_out;
+
+		for (i = 0; i < head.nsubstates; i++) {
+			if (ioctl == KVM_GET_VCPU_STATE)
+				r = kvm_vcpu_get_substate(vcpu, arg_base,
+							  &substates[i]);
+			else
+				r = kvm_vcpu_set_substate(vcpu, arg_base,
+							  &substates[i]);
+			if (r < 0)
+				goto vcpu_state_out;
+			head.nprocessed++;
+		}
 		r = 0;
+vcpu_state_out:
+		if (copy_to_user(&user_head->nprocessed, &head.nprocessed,
+				 sizeof(head.nprocessed)))
+			r = -EFAULT;
+		if (head.nsubstates > 1)
+			kfree(substates);
 		break;
 	}
 	default:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}
-out:
-	kfree(fpu);
-	kfree(kvm_sregs);
 	return r;
 }
 
@@ -1650,6 +1783,7 @@  static long kvm_dev_ioctl_check_extension_generic(long arg)
 	case KVM_CAP_USER_MEMORY:
 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
 	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
+	case KVM_CAP_VCPU_STATE:
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
 	case KVM_CAP_SET_BOOT_CPU_ID:
 #endif
@@ -1664,6 +1798,19 @@  static long kvm_dev_ioctl_check_extension_generic(long arg)
 	return kvm_dev_ioctl_check_extension(arg);
 }
 
+static bool kvm_check_substate(u32 type)
+{
+	switch (type) {
+	case KVM_VCPU_STATE_REGS:
+	case KVM_VCPU_STATE_SREGS:
+	case KVM_VCPU_STATE_FPU:
+	case KVM_VCPU_STATE_MP:
+		return true;
+	default:
+		return kvm_arch_check_substate(type);
+	}
+}
+
 static long kvm_dev_ioctl(struct file *filp,
 			  unsigned int ioctl, unsigned long arg)
 {
@@ -1702,6 +1849,42 @@  static long kvm_dev_ioctl(struct file *filp,
 	case KVM_TRACE_DISABLE:
 		r = -EOPNOTSUPP;
 		break;
+	case KVM_CHECK_VCPU_STATES: {
+		struct kvm_vcpu_state_list __user *user_head = (void *)arg;
+		struct kvm_vcpu_state_list head;
+		__u32 *types;
+		size_t size;
+		int i;
+
+		r = -EFAULT;
+		if (copy_from_user(&head, user_head,
+				   sizeof(struct kvm_vcpu_state_list)))
+			break;
+
+		r = -E2BIG;
+		if (head.ntypes > KVM_MAX_VCPU_SUBSTATES)
+			break;
+
+		size = head.ntypes * sizeof(__u32);
+		types = kmalloc(size, GFP_KERNEL);
+		r = -ENOMEM;
+		if (!types)
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(types, user_head->types, size))
+			goto vcpu_state_out;
+
+		for (i = 0; i < head.ntypes; i++)
+			if (!kvm_check_substate(types[i]))
+				types[i] = KVM_VCPU_STATE_INVALID;
+		r = 0;
+vcpu_state_out:
+		if (copy_to_user(&user_head->types, types, size))
+			r = -EFAULT;
+		kfree(types);
+		break;
+	}
 	default:
 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
 	}