
[v4,2/2] KVM: x86: KVM_CAP_SYNC_REGS

Message ID 20180201000336.154170-3-hofsass@google.com (mailing list archive)
State New, archived

Commit Message

Ken Hofsass Feb. 1, 2018, 12:03 a.m. UTC
This commit implements an enhanced x86 version of S390
KVM_CAP_SYNC_REGS functionality. KVM_CAP_SYNC_REGS "allow[s]
userspace to access certain guest registers without having
to call SET/GET_*REGS". This reduces ioctl overhead, which
is particularly important when userspace is making synchronous
guest state modifications (e.g. when emulating and/or intercepting
instructions).

KVM_CAP_SYNC_REGS was originally implemented upstream for s390; the x86
version differs as follows:
- userspace can select the register sets to be synchronized with kvm_run
  using bit-flags in the kvm_valid_regs and kvm_dirty_regs fields.
- vcpu_events is available in addition to the regs and sregs register
  sets.

Signed-off-by: Ken Hofsass <hofsass@google.com>
---
 Documentation/virtual/kvm/api.txt |  40 ++++++++++++++
 arch/x86/include/uapi/asm/kvm.h   |  19 ++++++-
 arch/x86/kvm/x86.c                | 108 +++++++++++++++++++++++++++++++++-----
 3 files changed, 152 insertions(+), 15 deletions(-)
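
To illustrate the intended flow, here is a minimal userspace sketch (not
part of the patch; it assumes the vcpu's kvm_run page has already been
mmap'd, uses the bit-flags added by this patch, and omits error handling):

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  void run_with_sync_regs(int vcpu_fd, struct kvm_run *run)
  {
          /* ask KVM to copy out GPRs and sregs on every exit */
          run->kvm_valid_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
          ioctl(vcpu_fd, KVM_RUN, 0);

          /* registers are now readable without a KVM_GET_REGS ioctl */
          run->s.regs.regs.rip += 2;  /* e.g. step over an emulated insn */

          /* flag the modified GPRs for copy-in on the next KVM_RUN */
          run->kvm_dirty_regs = KVM_SYNC_X86_REGS;
          ioctl(vcpu_fd, KVM_RUN, 0);
  }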

Comments

David Hildenbrand Feb. 1, 2018, 1:05 p.m. UTC | #1
> +/* kvm_sync_regs struct included by kvm_run struct */
>  struct kvm_sync_regs {
> +	/* Members of this structure are potentially malicious.
> +	 * Care must be taken by code reading, esp. interpreting,
> +	 * data fields from them inside KVM to prevent TOCTOU and
> +	 * double-fetch types of vulnerabilities.
> +	 */

Indeed, did you check if the set functions are safe?
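
(For illustration only, not code from this patch: the usual mitigation is
to fetch the shared data exactly once into a private copy, then validate
and apply only the copy, e.g.

  /* snapshot the userspace-shared struct once, so a concurrent
   * writer cannot change it between check and use */
  struct kvm_sregs sregs = vcpu->run->s.regs.sregs;

  if (__set_sregs(vcpu, &sregs))
          return -EINVAL;

anything read back from vcpu->run->s.regs after that point would
reintroduce the double fetch.)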

> +	struct kvm_regs regs;
> +	struct kvm_sregs sregs;
> +	struct kvm_vcpu_events events;
>  };
>  
>  #define KVM_X86_QUIRK_LINT0_REENABLED	(1 << 0)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 0204b2b8a293..ffd2c4e5d245 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -100,6 +100,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
>  static void process_nmi(struct kvm_vcpu *vcpu);
>  static void enter_smm(struct kvm_vcpu *vcpu);
>  static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
> +static void store_regs(struct kvm_vcpu *vcpu);
> +static int sync_regs(struct kvm_vcpu *vcpu);
> +static int check_valid_regs(struct kvm_vcpu *vcpu);
>  
>  struct kvm_x86_ops *kvm_x86_ops __read_mostly;
>  EXPORT_SYMBOL_GPL(kvm_x86_ops);
> @@ -2743,6 +2746,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_IMMEDIATE_EXIT:
>  		r = 1;
>  		break;
> +	case KVM_CAP_SYNC_REGS:
> +		r = KVM_SYNC_X86_VALID_FIELDS;
> +		break;
>  	case KVM_CAP_ADJUST_CLOCK:
>  		r = KVM_CLOCK_TSC_STABLE;
>  		break;
> @@ -7325,6 +7331,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
>  	return 0;
>  }
>  
> +static int check_valid_regs(struct kvm_vcpu *vcpu)
> +{
> +	if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS)
> +		return -EINVAL;
> +	return 0;
> +}
>  
>  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>  {
> @@ -7351,6 +7363,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>  		goto out;
>  	}
>  
> +	if (vcpu->run->kvm_valid_regs) {

You can check directly, without this if. (and not sure if the extra
function is needed here).

> +		r = check_valid_regs(vcpu);
> +		if (r != 0)
> +			goto out;
> +	}
> +	if (vcpu->run->kvm_dirty_regs) {
> +		r = sync_regs(vcpu);
> +		if (r != 0)
> +			goto out;
> +	}
> +
>  	/* re-sync apic's tpr */
>  	if (!lapic_in_kernel(vcpu)) {
>  		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
> @@ -7375,6 +7398,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>  

Looks good to me

Reviewed-by: David Hildenbrand <david@redhat.com>
Radim Krčmář Feb. 2, 2018, 9:04 p.m. UTC | #2
2018-02-01 14:05+0100, David Hildenbrand:
> > @@ -7351,6 +7363,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> >  		goto out;
> >  	}
> >  
> > +	if (vcpu->run->kvm_valid_regs) {
> 
> You can check directly, without this if. (and not sure if the extra
> function is needed here).

Right, it's unlikely that we'll be doing more checks on kvm_valid_regs,
so I've moved it here and replaced the block with

  if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
  	r = -EINVAL;
  	goto out;
  }

You can see the result in kvm/queue.

Btw. the userspace doesn't know what the next userspace exit is going to
be, so is the purpose of kvm_valid_regs to avoid needless writes of
registers on old userspaces?

Thanks.

Ken Hofsass Feb. 2, 2018, 11:54 p.m. UTC | #3
Radim,

On Fri, Feb 2, 2018 at 1:04 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:

> Right, it's unlikely that we'll be doing more checks on kvm_valid_regs,
> so I've moved it here and replaced the block with
>
>   if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
>         r = -EINVAL;
>         goto out;
>   }
>
> You can see the result in kvm/queue.

Thanks for cleaning that up & queuing the commits!

> Btw. the userspace doesn't know what the next userspace exit is going to
> be, so is the purpose of kvm_valid_regs to avoid needless writes of
> registers on old userspaces?

If you mean userspace agents that don't have code to use SYNC_REGS,
then yes. Or new userspace agents that have not enabled features that
use SYNC_REGS, or that only need a subset of the available register
sets. Originally I had FPU and debug_regs as well; I expected those
would not be as broadly used and wanted to enable turning them on/off.
I also wanted to keep the flexibility because I have related patches
extending and building on SYNC_REGS that I will be sending for review
in the near future.
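
A sketch of that kind of selective opt-in (hypothetical userspace code;
kvm_fd is the /dev/kvm fd and run is the mmap'd kvm_run):

  int supported = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SYNC_REGS);

  /* request copy-out only for the sets this kernel supports and
   * this exit actually needs */
  run->kvm_valid_regs = 0;
  if (supported & KVM_SYNC_X86_REGS)
          run->kvm_valid_regs |= KVM_SYNC_X86_REGS;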

thanks again,
Ken
Wanpeng Li Feb. 5, 2018, 1:29 a.m. UTC | #4
2018-02-01 8:03 GMT+08:00 Ken Hofsass <hofsass@google.com>:
> This commit implements an enhanced x86 version of S390
> KVM_CAP_SYNC_REGS functionality. KVM_CAP_SYNC_REGS "allow[s]
> userspace to access certain guest registers without having
> to call SET/GET_*REGS". This reduces ioctl overhead, which
> is particularly important when userspace is making synchronous
> guest state modifications (e.g. when emulating and/or intercepting
> instructions).
>
> KVM_CAP_SYNC_REGS was originally implemented upstream for s390; the x86
> version differs as follows:
> - userspace can select the register sets to be synchronized with kvm_run
>   using bit-flags in the kvm_valid_regs and kvm_dirty_regs fields.
> - vcpu_events is available in addition to the regs and sregs register
>   sets.
>
> Signed-off-by: Ken Hofsass <hofsass@google.com>

This commit in kvm/queue results in:

 general protection fault: 0000 [#1] PREEMPT SMP PTI
 CPU: 7 PID: 2348 Comm: qemu-system-x86 Not tainted 4.15.0+ #4
 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS
FBKTC1AUS 02/16/2016
 RIP: 0010:preempt_notifier_unregister+0x13/0x40
 RSP: 0018:ffffbe6541c07da8 EFLAGS: 00010286
 RAX: dead000000000100 RBX: ffff99ed46ca8000 RCX: 0000000037271e7b
 RDX: dead000000000200 RSI: 0000000000000000 RDI: ffff99ed46ca8008
 RBP: ffffbe6541c07da8 R08: 00000000fe0913b4 R09: 442c1f7a00000000
 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000
 R13: ffff99ed46ca8000 R14: ffff99ed46ca8058 R15: ffff99ed47635400
 FS:  00007f3c2b474700(0000) GS:ffff99ed8e200000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007f3c1c001000 CR3: 00000003c73f4006 CR4: 00000000001626e0
 Call Trace:
  vcpu_put+0x28/0x50 [kvm]
  kvm_arch_vcpu_ioctl_set_sregs+0x2d/0x40 [kvm]
  kvm_vcpu_ioctl+0x54a/0x720 [kvm]
  ? __fget+0xfc/0x210
  ? __fget+0xfc/0x210
  do_vfs_ioctl+0xa4/0x6a0
  ? __fget+0x11d/0x210
  SyS_ioctl+0x79/0x90
  entry_SYSCALL_64_fastpath+0x25/0x9c
 RIP: 0033:0x7f3c36fd0f47
 RSP: 002b:00007f3c2b473668 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
 RAX: ffffffffffffffda RBX: 0000555a11f78c10 RCX: 00007f3c36fd0f47
 RDX: 00007f3c2b473710 RSI: 000000004138ae84 RDI: 000000000000000f
 RBP: 0000000000000003 R08: 0000555a107825b0 R09: 00007f3c1c001000
 R10: 00007f3c1c003010 R11: 0000000000000246 R12: 00007f3c2b473870
 R13: 00007f3c1c001000 R14: 00007f3c2b4749c0 R15: 0000555a11f78c10
 RIP: preempt_notifier_unregister+0x13/0x40 RSP: ffffbe6541c07da8
 ---[ end trace 318a26af2b7ef2bc ]---

Regards,
Wanpeng Li

> [full patch quote trimmed; see the Patch section at the end of this page]
Ken Hofsass Feb. 6, 2018, 12:32 a.m. UTC | #5
Hi Wanpeng, that's disturbing! Could you give me more context? Did
this occur while running part of the KVM unit tests? Does this happen
every time? Do you know if this is the first invocation of
kvm_arch_vcpu_ioctl_set_sregs? When I ran against kvm-unit-tests
w/qemu, I didn't hit this. Nor when I ran our other internal tests.

thanks,
Ken Hofsass


On Sun, Feb 4, 2018 at 5:29 PM, Wanpeng Li <kernellwp@gmail.com> wrote:
> [quoted commit message, crash trace, and patch trimmed; see #4 above]
Wanpeng Li Feb. 6, 2018, 12:46 a.m. UTC | #6
2018-02-06 8:32 GMT+08:00 Ken Hofsass <hofsass@google.com>:
> Hi Wanpeng, that's disturbing! Could you give me more context? Did
> this occur while running part of the KVM unit tests? Does this happen
> every time? Do you know if this is the first invocation of
> kvm_arch_vcpu_ioctl_set_sregs? When I ran against kvm-unit-tests
> w/qemu, I didn't hit this. Nor when I ran our other internal tests.

I just ran a KVM guest (Ubuntu 16.04, 4.15-rc3) against kvm/queue w/
CONFIG_PREEMPT enabled.

Regards,
Wanpeng Li

> [rest of the quoted thread trimmed; see #4 and #5 above]

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index de55fe173afe..97bc2d0e69aa 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4014,6 +4014,46 @@  Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be
 accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from
 the guest.
 
+6.74 KVM_CAP_SYNC_REGS
+Architectures: s390, x86
+Target: s390: always enabled, x86: vcpu
+Parameters: none
+Returns: x86: KVM_CHECK_EXTENSION returns a bit-array indicating which register
+sets are supported (bitfields defined in arch/x86/include/uapi/asm/kvm.h).
+
+As described above in the kvm_sync_regs struct info in section 5 (kvm_run):
+KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers
+without having to call SET/GET_*REGS". This reduces overhead by eliminating
+repeated ioctl calls for setting and/or getting register values. This is
+particularly important when userspace is making synchronous guest state
+modifications, e.g. when emulating and/or intercepting instructions in
+userspace.
+
+For s390 specifics, please refer to the source code.
+
+For x86:
+- the register sets to be copied out to kvm_run are selectable
+  by userspace (rather than all sets being copied out for every exit).
+- vcpu_events are available in addition to regs and sregs.
+
+For x86, the 'kvm_valid_regs' field of struct kvm_run is overloaded to
+function as an input bit-array field set by userspace to indicate the
+specific register sets to be copied out on the next exit.
+
+To indicate that userspace has modified values that should be copied into
+the vCPU, the architecture-independent 'kvm_dirty_regs' bit-array field must
+be set, using the same bit-flags as for the 'kvm_valid_regs' field.
+If the dirty bit is not set, then the register set values will not be copied
+into the vCPU even if they've been modified.
+
+Unused bitfields in the bitarrays must be set to zero.
+
+struct kvm_sync_regs {
+        struct kvm_regs regs;
+        struct kvm_sregs sregs;
+        struct kvm_vcpu_events events;
+};
+
 7. Capabilities that can be enabled on VMs
 ------------------------------------------
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index f3a960488eae..c535c2fdea13 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -354,8 +354,25 @@  struct kvm_xcrs {
 	__u64 padding[16];
 };
 
-/* definition of registers in kvm_run */
+#define KVM_SYNC_X86_REGS      (1UL << 0)
+#define KVM_SYNC_X86_SREGS     (1UL << 1)
+#define KVM_SYNC_X86_EVENTS    (1UL << 2)
+
+#define KVM_SYNC_X86_VALID_FIELDS \
+	(KVM_SYNC_X86_REGS| \
+	 KVM_SYNC_X86_SREGS| \
+	 KVM_SYNC_X86_EVENTS)
+
+/* kvm_sync_regs struct included by kvm_run struct */
 struct kvm_sync_regs {
+	/* Members of this structure are potentially malicious.
+	 * Care must be taken by code reading, esp. interpreting,
+	 * data fields from them inside KVM to prevent TOCTOU and
+	 * double-fetch types of vulnerabilities.
+	 */
+	struct kvm_regs regs;
+	struct kvm_sregs sregs;
+	struct kvm_vcpu_events events;
 };
 
 #define KVM_X86_QUIRK_LINT0_REENABLED	(1 << 0)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0204b2b8a293..ffd2c4e5d245 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -100,6 +100,9 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
 static void enter_smm(struct kvm_vcpu *vcpu);
 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+static void store_regs(struct kvm_vcpu *vcpu);
+static int sync_regs(struct kvm_vcpu *vcpu);
+static int check_valid_regs(struct kvm_vcpu *vcpu);
 
 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -2743,6 +2746,9 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_IMMEDIATE_EXIT:
 		r = 1;
 		break;
+	case KVM_CAP_SYNC_REGS:
+		r = KVM_SYNC_X86_VALID_FIELDS;
+		break;
 	case KVM_CAP_ADJUST_CLOCK:
 		r = KVM_CLOCK_TSC_STABLE;
 		break;
@@ -7325,6 +7331,12 @@  static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int check_valid_regs(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS)
+		return -EINVAL;
+	return 0;
+}
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
@@ -7351,6 +7363,17 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		goto out;
 	}
 
+	if (vcpu->run->kvm_valid_regs) {
+		r = check_valid_regs(vcpu);
+		if (r != 0)
+			goto out;
+	}
+	if (vcpu->run->kvm_dirty_regs) {
+		r = sync_regs(vcpu);
+		if (r != 0)
+			goto out;
+	}
+
 	/* re-sync apic's tpr */
 	if (!lapic_in_kernel(vcpu)) {
 		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
@@ -7375,6 +7398,8 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 out:
 	kvm_put_guest_fpu(vcpu);
+	if (vcpu->run->kvm_valid_regs)
+		store_regs(vcpu);
 	post_kvm_run_save(vcpu);
 	kvm_sigset_deactivate(vcpu);
 
@@ -7382,10 +7407,8 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return r;
 }
 
-int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-	vcpu_load(vcpu);
-
 	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
 		/*
 		 * We are here if userspace calls get_regs() in the middle of
@@ -7418,15 +7441,18 @@  int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 	regs->rip = kvm_rip_read(vcpu);
 	regs->rflags = kvm_get_rflags(vcpu);
+}
 
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	vcpu_load(vcpu);
+	__get_regs(vcpu, regs);
 	vcpu_put(vcpu);
 	return 0;
 }
 
-int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-	vcpu_load(vcpu);
-
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
 	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 
@@ -7455,7 +7481,12 @@  int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.exception.pending = false;
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
 
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	vcpu_load(vcpu);
+	__set_regs(vcpu, regs);
 	vcpu_put(vcpu);
 	return 0;
 }
@@ -7470,13 +7501,10 @@  void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
 
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-				  struct kvm_sregs *sregs)
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
 	struct desc_ptr dt;
 
-	vcpu_load(vcpu);
-
 	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
 	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
 	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -7507,7 +7535,13 @@  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
 		set_bit(vcpu->arch.interrupt.nr,
 			(unsigned long *)sregs->interrupt_bitmap);
+}
 
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	vcpu_load(vcpu);
+	__get_sregs(vcpu, sregs);
 	vcpu_put(vcpu);
 	return 0;
 }
@@ -7579,8 +7613,7 @@  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-				  struct kvm_sregs *sregs)
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
 	struct msr_data apic_base_msr;
 	int mmu_reset_needed = 0;
@@ -7588,8 +7621,6 @@  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	struct desc_ptr dt;
 	int ret = -EINVAL;
 
-	vcpu_load(vcpu);
-
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
 			(sregs->cr4 & X86_CR4_OSXSAVE))
 		goto out;
@@ -7665,6 +7696,16 @@  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 	ret = 0;
 out:
+	return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	int ret;
+
+	vcpu_load(vcpu);
+	ret = __set_sregs(vcpu, sregs);
 	vcpu_put(vcpu);
 	return ret;
 }
@@ -7791,6 +7832,45 @@  int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return 0;
 }
 
+static void store_regs(struct kvm_vcpu *vcpu)
+{
+	BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
+
+	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
+		__get_regs(vcpu, &vcpu->run->s.regs.regs);
+
+	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
+		__get_sregs(vcpu, &vcpu->run->s.regs.sregs);
+
+	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
+		kvm_vcpu_ioctl_x86_get_vcpu_events(
+				vcpu, &vcpu->run->s.regs.events);
+}
+
+static int sync_regs(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
+		return -EINVAL;
+
+	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
+		__set_regs(vcpu, &vcpu->run->s.regs.regs);
+		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
+	}
+	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
+		if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
+			return -EINVAL;
+		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
+	}
+	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
+		if (kvm_vcpu_ioctl_x86_set_vcpu_events(
+				vcpu, &vcpu->run->s.regs.events))
+			return -EINVAL;
+		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
+	}
+
+	return 0;
+}
+
 static void fx_init(struct kvm_vcpu *vcpu)
 {
 	fpstate_init(&vcpu->arch.guest_fpu.state);