diff mbox

KVM: x86: KVM_CAP_SYNC_REGS

Message ID 20170920174204.7215-1-hofsass@google.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ken Hofsass Sept. 20, 2017, 5:42 p.m. UTC
This commit implements an enhanced x86 version of S390
KVM_CAP_SYNC_REGS functionality. KVM_CAP_SYNC_REGS "allow[s]
userspace to access certain guest registers without having
to call SET/GET_*REGS". This reduces ioctl overhead which
is particularly important when userspace is making synchronous
guest state modifications (e.g. when emulating and/or intercepting
instructions).

Originally implemented upstream for the S390, the x86 differences
follow:
- the capability can be enabled/disabled
- the register sets to be copied out to kvm_run are selectable
by userspace
- vcpu_events and selectable MSRs are available in addition to the
basic register sets (regs, sregs, debug_regs, and fpu).

Signed-off-by: Ken Hofsass <hofsass@google.com>
---
 Documentation/virtual/kvm/api.txt |  49 ++++++++++++++
 arch/x86/include/uapi/asm/kvm.h   |  32 ++++++++-
 arch/x86/kvm/x86.c                | 134 +++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/kvm.h          |   6 +-
 4 files changed, 216 insertions(+), 5 deletions(-)

Comments

Jim Mattson Sept. 25, 2017, 6:50 p.m. UTC | #1
Reviewed-by: Jim Mattson <jmattson@google.com>

On Wed, Sep 20, 2017 at 10:42 AM, Ken Hofsass <hofsass@google.com> wrote:
> This commit implements an enhanced x86 version of S390
> KVM_CAP_SYNC_REGS functionality. KVM_CAP_SYNC_REGS "allow[s]
> userspace to access certain guest registers without having
> to call SET/GET_*REGS”. This reduces ioctl overhead which
> is particularly important when userspace is making synchronous
> guest state modifications (e.g. when emulating and/or intercepting
> instructions).
>
> Originally implemented upstream for the S390, the x86 differences
> follow:
> - the capability can be enabled/disabled
> - the register sets to be copied out out to kvm_run are selectable
> by userspace
> - vcpu_events and selectable MSRs are available in addition to the
> basic register sets (regs, sregs, debug_regs, and fpu).
>
> Signed-off-by: Ken Hofsass <hofsass@google.com>
> ---
>  Documentation/virtual/kvm/api.txt |  49 ++++++++++++++
>  arch/x86/include/uapi/asm/kvm.h   |  32 ++++++++-
>  arch/x86/kvm/x86.c                | 134 +++++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/kvm.h          |   6 +-
>  4 files changed, 216 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index e63a35fafef0..9536eb310d04 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3951,6 +3951,55 @@ Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be
>  accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from
>  the guest.
>
> +6.74 KVM_CAP_SYNC_REGS
> +Architectures: s390, x86 (vmx-only)
> +Target: s390: always enabled, x86: vcpu
> +Parameters: s390: none, x86: args[0] takes bitmask selecting which register
> +sets to copy out (bitfields defined in arch/x86/include/uapi/asm/kvm.h).
> +
> +As described above in the kvm_sync_regs struct info in section 5 (kvm_run):
> +KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers
> +without having to call SET/GET_*REGS". This reduces overhead by eliminating
> +repeated ioctl calls for setting and/or getting register values. This is
> +particularly important when userspace is making synchronous guest state
> +modifications, e.g. when emulating and/or intercepting instructions in
> +userspace.
> +
> +For s390 specifics, please the source code.
> +
> +For x86:
> +- the capability can be enabled/disabled (s390 is always enabled).
> +- the register sets to be copied out out to kvm_run are selectable
> +  by userspace (rather that all sets being copied out for every exit).
> +- vcpu_events and selectable MSRs are available in addition to the
> +  basic register sets (regs, sregs, debug_regs, and fpu).
> +
> +In addition to the register sets, the x86 'kvm_sync_regs' struct definition
> +includes the bitarray, 'sync_regs', that toggles whether specific register
> +sets are to be copied out. When enabling SYNC_REGS via KVM_CAP_ENABLE, the
> +value in args[0] is copied into the 'sync_regs' bitarray value by KVM. But
> +like the register sets, the copy-out field can be modified while handling
> +an exit so that additonal KVM_CAP_ENABLE ioctl calls can be avoided.
> +
> +To indicate when userspace has modified values that should be copied into
> +the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set.
> +This is done using the same bitflags as for the 'sync_regs' copy-out field.
> +If the dirty bit is not set, then the register values will not be copied
> +into the vCPU even if they've been modified.
> +
> +Unused bitfields in the bitarrays must be set to zero.
> +
> +struct kvm_sync_regs {
> +        __u64 sync_regs;
> +        struct kvm_regs regs;
> +        struct kvm_sregs sregs;
> +        struct kvm_debugregs debugregs;
> +        struct kvm_fpu fpu;
> +        struct kvm_vcpu_events events;
> +        struct kvm_msrs msrs;
> +        struct kvm_msr_entry msr_entries[SYNC_REGS_NUM_MSRS];
> +};
> +
>  7. Capabilities that can be enabled on VMs
>  ------------------------------------------
>
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index c2824d02ba37..cdee17a4b0ee 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -353,8 +353,38 @@ struct kvm_xcrs {
>         __u64 padding[16];
>  };
>
> -/* definition of registers in kvm_run */
> +#define KVM_SYNC_X86_REGS      (1UL << 0)
> +#define KVM_SYNC_X86_SREGS     (1UL << 1)
> +#define KVM_SYNC_X86_DEBUGREGS (1UL << 2)
> +#define KVM_SYNC_X86_FPU       (1UL << 3)
> +#define KVM_SYNC_X86_EVENTS    (1UL << 4)
> +#define KVM_SYNC_X86_MSRS      (1UL << 5)
> +#define KVM_SYNC_X86_NUM_FIELDS                6
> +
> +#define KVM_SYNC_X86_VALID_BITS \
> +       (KVM_SYNC_X86_REGS| \
> +        KVM_SYNC_X86_SREGS| \
> +        KVM_SYNC_X86_DEBUGREGS| \
> +        KVM_SYNC_X86_FPU| \
> +        KVM_SYNC_X86_EVENTS| \
> +        KVM_SYNC_X86_MSRS)
> +
> +#define SYNC_REGS_NUM_MSRS 8
> +/* kvm_sync_regs struct included by kvm_run struct */
>  struct kvm_sync_regs {
> +       /* Members of this structure are potentially malicious.
> +        * Care must be taken by code reading, esp. interpreting,
> +        * data fields from them inside KVM to prevent TOCTOU and
> +        * double-fetch types of vulnerabilities.
> +        */
> +       __u64 sync_regs;
> +       struct kvm_regs regs;
> +       struct kvm_sregs sregs;
> +       struct kvm_debugregs debugregs;
> +       struct kvm_fpu fpu;
> +       struct kvm_vcpu_events events;
> +       struct kvm_msrs msrs;
> +       struct kvm_msr_entry msr_entries[SYNC_REGS_NUM_MSRS];
>  };
>
>  #define KVM_X86_QUIRK_LINT0_REENABLED  (1 << 0)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cd17b7d9a107..ad3f84cbf7f3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -100,6 +100,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
>  static void process_nmi(struct kvm_vcpu *vcpu);
>  static void enter_smm(struct kvm_vcpu *vcpu);
>  static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
> +static void sync_regs_store_to_kvmrun(struct kvm_vcpu *vcpu);
> +static int sync_regs_load_from_kvmrun(struct kvm_vcpu *vcpu);
>
>  struct kvm_x86_ops *kvm_x86_ops __read_mostly;
>  EXPORT_SYMBOL_GPL(kvm_x86_ops);
> @@ -2587,15 +2589,22 @@ EXPORT_SYMBOL_GPL(kvm_get_msr_common);
>   *
>   * @return number of msrs set successfully.
>   */
> -static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
> +static int __msr_io(struct kvm_vcpu *vcpu, u32 nmsrs, struct kvm_msrs *msrs,
>                     struct kvm_msr_entry *entries,
>                     int (*do_msr)(struct kvm_vcpu *vcpu,
>                                   unsigned index, u64 *data))
>  {
>         int i, idx;
>
> +       /* For SYNC_REGS call into __msr_io, it is possible that userspace
> +        * might have modified the count since it was snapshotted. Could be
> +        * accidental but might be intentional attempt at TOCTOU.
> +        */
> +       if (nmsrs != msrs->nmsrs)
> +               return -EINVAL;
> +
>         idx = srcu_read_lock(&vcpu->kvm->srcu);
> -       for (i = 0; i < msrs->nmsrs; ++i)
> +       for (i = 0; i < nmsrs; ++i)
>                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
>                         break;
>         srcu_read_unlock(&vcpu->kvm->srcu, idx);
> @@ -2633,7 +2642,7 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
>                 goto out;
>         }
>
> -       r = n = __msr_io(vcpu, &msrs, entries, do_msr);
> +       r = n = __msr_io(vcpu, msrs.nmsrs, &msrs, entries, do_msr);
>         if (r < 0)
>                 goto out_free;
>
> @@ -2665,6 +2674,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>         case KVM_CAP_NOP_IO_DELAY:
>         case KVM_CAP_MP_STATE:
>         case KVM_CAP_SYNC_MMU:
> +       case KVM_CAP_SYNC_REGS:
>         case KVM_CAP_USER_NMI:
>         case KVM_CAP_REINJECT_CONTROL:
>         case KVM_CAP_IRQ_INJECT_STATUS:
> @@ -3435,6 +3445,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>                         return -EINVAL;
>                 return kvm_hv_activate_synic(vcpu, cap->cap ==
>                                              KVM_CAP_HYPERV_SYNIC2);
> +       case KVM_CAP_SYNC_REGS:
> +               if (cap->args[0] & ~KVM_SYNC_X86_VALID_BITS)
> +                       return -EINVAL;
> +               vcpu->run->s.regs.sync_regs = cap->args[0];
> +               return 0;
>         default:
>                 return -EINVAL;
>         }
> @@ -7247,6 +7262,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>                 goto out;
>         }
>
> +       if (vcpu->run->kvm_dirty_regs) {
> +               r = sync_regs_load_from_kvmrun(vcpu);
> +               if (r != 0)
> +                       goto out;
> +       }
> +
>         /* re-sync apic's tpr */
>         if (!lapic_in_kernel(vcpu)) {
>                 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
> @@ -7270,6 +7291,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>                 r = vcpu_run(vcpu);
>
>  out:
> +       if (vcpu->run->s.regs.sync_regs)
> +               sync_regs_store_to_kvmrun(vcpu);
>         post_kvm_run_save(vcpu);
>         if (vcpu->sigset_active)
>                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
> @@ -7648,6 +7671,111 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
>         return 0;
>  }
>
> +static void sync_regs_store_to_kvmrun(struct kvm_vcpu *vcpu)
> +{
> +       BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_UNION_SIZE_BYTES);
> +       vcpu->run->kvm_valid_regs = 0;
> +
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_REGS) {
> +               kvm_arch_vcpu_ioctl_get_regs(vcpu, &vcpu->run->s.regs.regs);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_REGS;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_SREGS) {
> +               kvm_arch_vcpu_ioctl_get_sregs(vcpu, &vcpu->run->s.regs.sregs);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_SREGS;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_DEBUGREGS) {
> +               kvm_vcpu_ioctl_x86_get_debugregs(
> +                               vcpu, &vcpu->run->s.regs.debugregs);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_DEBUGREGS;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_FPU) {
> +               kvm_arch_vcpu_ioctl_get_fpu(vcpu, &vcpu->run->s.regs.fpu);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_FPU;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_EVENTS) {
> +               kvm_vcpu_ioctl_x86_get_vcpu_events(
> +                               vcpu, &vcpu->run->s.regs.events);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_EVENTS;
> +       }
> +       if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_MSRS) {
> +               u32 nmsrs = vcpu->run->s.regs.msrs.nmsrs;
> +
> +               if (nmsrs > SYNC_REGS_NUM_MSRS)
> +                       nmsrs = vcpu->run->s.regs.msrs.nmsrs
> +                                       = SYNC_REGS_NUM_MSRS;
> +               __msr_io(vcpu, nmsrs, &vcpu->run->s.regs.msrs,
> +                        vcpu->run->s.regs.msr_entries, do_get_msr);
> +               vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_MSRS;
> +       }
> +}
> +
> +static int sync_regs_load_from_kvmrun(struct kvm_vcpu *vcpu)
> +{
> +       int r = -EINVAL;
> +
> +       if (unlikely(vcpu->run->kvm_dirty_regs == 0))
> +               goto out;
> +
> +       if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_BITS)
> +               goto out;
> +
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
> +               if (kvm_arch_vcpu_ioctl_set_regs(
> +                               vcpu, &vcpu->run->s.regs.regs))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
> +               if (kvm_arch_vcpu_ioctl_set_sregs(
> +                               vcpu, &vcpu->run->s.regs.sregs))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_DEBUGREGS) {
> +               if (kvm_vcpu_ioctl_x86_set_debugregs(
> +                               vcpu, &vcpu->run->s.regs.debugregs))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_DEBUGREGS;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_FPU) {
> +               if (kvm_arch_vcpu_ioctl_set_fpu(
> +                               vcpu, &vcpu->run->s.regs.fpu))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_FPU;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
> +               if (kvm_vcpu_ioctl_x86_set_vcpu_events(
> +                               vcpu, &vcpu->run->s.regs.events))
> +                       goto out;
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
> +       }
> +       if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_MSRS) {
> +               u32 nmsrs_in = vcpu->run->s.regs.msrs.nmsrs;
> +               u32 nmsrs_out = 0;
> +
> +               if (!nmsrs_in)
> +                       goto out;
> +               else if (nmsrs_in > SYNC_REGS_NUM_MSRS) {
> +                       r = -E2BIG;
> +                       goto out;
> +               }
> +               nmsrs_out = __msr_io(vcpu, nmsrs_in, &vcpu->run->s.regs.msrs,
> +                                    vcpu->run->s.regs.msr_entries, do_set_msr);
> +               if (!nmsrs_out) {
> +                       r = -EINVAL;
> +                       goto out;
> +               } else if (nmsrs_in != nmsrs_out) {
> +                       r = nmsrs_out;
> +                       goto out;
> +               }
> +               vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_MSRS;
> +       }
> +       r = 0;
> +out:
> +       return r;
> +}
> +
>  static void fx_init(struct kvm_vcpu *vcpu)
>  {
>         fpstate_init(&vcpu->arch.guest_fpu.state);
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 838887587411..90a6b24df901 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -395,6 +395,10 @@ struct kvm_run {
>                 char padding[256];
>         };
>
> +       /* 2048 is the size of the char array IBM used to bound/pad the size
> +        * of the union that holds sync regs.
> +        */
> +#define SYNC_REGS_UNION_SIZE_BYTES 2048
>         /*
>          * shared registers between kvm and userspace.
>          * kvm_valid_regs specifies the register classes set by the host
> @@ -406,7 +410,7 @@ struct kvm_run {
>         __u64 kvm_dirty_regs;
>         union {
>                 struct kvm_sync_regs regs;
> -               char padding[2048];
> +               char padding[SYNC_REGS_UNION_SIZE_BYTES];
>         } s;
>  };
>
> --
> 2.14.1.690.gbb1197296e-goog
>
David Hildenbrand Sept. 28, 2017, 3:17 p.m. UTC | #2
On 20.09.2017 19:42, Ken Hofsass wrote:
> This commit implements an enhanced x86 version of S390
> KVM_CAP_SYNC_REGS functionality. KVM_CAP_SYNC_REGS "allow[s]
> userspace to access certain guest registers without having
> to call SET/GET_*REGS”. This reduces ioctl overhead which
> is particularly important when userspace is making synchronous
> guest state modifications (e.g. when emulating and/or intercepting
> instructions).
> 
> Originally implemented upstream for the S390, the x86 differences
> follow:
> - the capability can be enabled/disabled
> - the register sets to be copied out out to kvm_run are selectable
> by userspace

Why is this necessary? Why not simply store everything? And mark via
kvm_valid_regs which fields are actually valid?

Also, I wonder if user space could simply modify (reduce)
vcpu->run->kvm_valid_regs to achieve the same behavior (when storing in
the kernel, simply check if the valid bit is set).

> - vcpu_events and selectable MSRs are available in addition to the
> basic register sets (regs, sregs, debug_regs, and fpu).
> 
> Signed-off-by: Ken Hofsass <hofsass@google.com>
> ---
>  Documentation/virtual/kvm/api.txt |  49 ++++++++++++++
>  arch/x86/include/uapi/asm/kvm.h   |  32 ++++++++-
>  arch/x86/kvm/x86.c                | 134 +++++++++++++++++++++++++++++++++++++-
>  include/uapi/linux/kvm.h          |   6 +-
>  4 files changed, 216 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index e63a35fafef0..9536eb310d04 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3951,6 +3951,55 @@ Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be
>  accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from
>  the guest.
>  
> +6.74 KVM_CAP_SYNC_REGS
> +Architectures: s390, x86 (vmx-only)
> +Target: s390: always enabled, x86: vcpu
> +Parameters: s390: none, x86: args[0] takes bitmask selecting which register
> +sets to copy out (bitfields defined in arch/x86/include/uapi/asm/kvm.h).
> +
> +As described above in the kvm_sync_regs struct info in section 5 (kvm_run):
> +KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers
> +without having to call SET/GET_*REGS". This reduces overhead by eliminating
> +repeated ioctl calls for setting and/or getting register values. This is
> +particularly important when userspace is making synchronous guest state
> +modifications, e.g. when emulating and/or intercepting instructions in
> +userspace.
> +
> +For s390 specifics, please the source code.

"... please have a look at the source code."?

> +
> +For x86:
> +- the capability can be enabled/disabled (s390 is always enabled).

Move that s390 comment to "For s390".

You could simply always store and let user space control via
vcpu->run->kvm_valid_regs which ones to actually store. This would store
for old QEMUs, but do we care? This would get rid of the need for the
capability. User space could simply check vcpu->run->kvm_valid_regs
(assuming it was always set to 0, we would have to verify).

> +- the register sets to be copied out out to kvm_run are selectable
> +  by userspace (rather that all sets being copied out for every exit).
> +- vcpu_events and selectable MSRs are available in addition to the
> +  basic register sets (regs, sregs, debug_regs, and fpu).
> +> +In addition to the register sets, the x86 'kvm_sync_regs' struct
definition
> +includes the bitarray, 'sync_regs', that toggles whether specific register
> +sets are to be copied out. When enabling SYNC_REGS via KVM_CAP_ENABLE, the
> +value in args[0] is copied into the 'sync_regs' bitarray value by KVM. But
> +like the register sets, the copy-out field can be modified while handling
> +an exit so that additonal KVM_CAP_ENABLE ioctl calls can be avoided.
> +
> +To indicate when userspace has modified values that should be copied into
> +the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set.
> +This is done using the same bitflags as for the 'sync_regs' copy-out field.
> +If the dirty bit is not set, then the register values will not be copied
> +into the vCPU even if they've been modified.
> +
> +Unused bitfields in the bitarrays must be set to zero.
> +
> +struct kvm_sync_regs {
> +        __u64 sync_regs;
> +        struct kvm_regs regs;
> +        struct kvm_sregs sregs;
> +        struct kvm_debugregs debugregs;
> +        struct kvm_fpu fpu;
> +        struct kvm_vcpu_events events;
> +        struct kvm_msrs msrs;
> +        struct kvm_msr_entry msr_entries[SYNC_REGS_NUM_MSRS];
> +};
> +
Paolo Bonzini Sept. 28, 2017, 3:34 p.m. UTC | #3
On 28/09/2017 17:17, David Hildenbrand wrote:
>>
>> Originally implemented upstream for the S390, the x86 differences
>> follow:
>> - the capability can be enabled/disabled
>> - the register sets to be copied out out to kvm_run are selectable
>> by userspace
> Why is this necessary? Why not simply store everything? And mark via
> kvm_valid_regs which fields are actually valid?

Storing everything would probably be pretty expensive.  I'm not sure
which MSRs you'd want to store explicitly (e.g. efer is already parts of
sregs) though---Bjorn what are you using?

I'm also not sure about the debug_regs... I like the patch, but I would
like to understand more how it's used (all I can guess is that it's
related to emulation in userspace).

I would also like to have a testcase for kvm-unit-tests api/ in order to
commit it.

Paolo

> Also, I wonder if user space could simply modify (reduce)
> vcpu->run->kvm_valid_regs to achieve the same behavior (when storing in
> the kernel, simply check if the valid bit is set).
>
Ken Hofsass Oct. 3, 2017, 1:40 a.m. UTC | #4
Hi David, Paolo, thanks for your reviews!

On Thu, Sep 28, 2017 at 8:17 AM, David Hildenbrand <david@redhat.com> wrote:
> On 20.09.2017 19:42, Ken Hofsass wrote:
> > - the register sets to be copied out out to kvm_run are selectable
> > by userspace
>
> Why is this necessary? Why not simply store everything? And mark via
> kvm_valid_regs which fields are actually valid?

As Paolo surmised, it becomes expensive to store all register sets for
every exit.

> Also, I wonder if user space could simply modify (reduce)
> vcpu->run->kvm_valid_regs to achieve the same behavior (when storing in
> the kernel, simply check if the valid bit is set).

I agree that we could overload kvm_valid_regs to be input/output
rather than having the separate u64 sync_regs for input only. I will
refactor for the next version of the patch.

> > +For s390 specifics, please the source code.
> "... please have a look at the source code."?
>
> > +
> > +For x86:
> > +- the capability can be enabled/disabled (s390 is always enabled).
> Move that s390 comment to "For s390".

Thanks for catching those, will incorporate.

> You could simply always store and let user space control via
> vcpu->run->kvm_valid_regs which ones to actually store.
> This would get rid of the need for the capability.

Yeah, looking at the code yet again, I saw your point that the ENABLE
section of the capability code doesn't provide any new utility.  I'll
take that part out.

However, I think keeping the QUERY part is needed for userspace to
have positive confirmation of the capability since it is too expensive
to forcibly store all the register sets. (And the capability value is
already defined from the S390 version so there's no additional
namespace pollution.)

On Thu, Sep 28, 2017 at 8:34 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> I'm not sure which MSRs you'd want to store explicitly (e.g. efer is already parts of
> sregs) though---Bjorn what are you using?

I could be swayed to remove this for now (more below).

> I'm also not sure about the debug_regs...

In part for completeness but also to have them available in a
non-interactive debugging/instrumentation scenario, e.g. when using
KVM to support driver/kernel development work.

> I like the patch, but I would
> like to understand more how it's used (all I can guess is that it's
> related to emulation in userspace).

Thanks! And you're right that reducing overhead for userspace
emulation (as presented by Andy Honig, et al at KVM forum) is part of
the rationale.

Similarly, we are also interested in building other low-level
functionality in userspace to avoid expanding kernel-level attack
surface. Toward that end, I expect to send you another patchset in the
next few weeks that builds on SYNC_REGS. (Monitoring MSRs might make
more sense when in that context.)

> I would also like to have a testcase for kvm-unit-tests api/ in order to
> commit it.

Certainly!  I'll get the test and the other fixes done and post a v2 patch.

thanks again,
Ken
Paolo Bonzini Oct. 3, 2017, 7:57 a.m. UTC | #5
On 03/10/2017 03:40, Ken Hofsass wrote:
>> I'm also not sure about the debug_regs...
> 
> In part for completeness but also to have them available in a
> non-interactive debugging/instrumentation scenario, e.g. when using
> KVM to support driver/kernel development work.

Since you are removing the "enable" part, querying the capability with
the "check extension" ioctl can return the supported register sets.  If
So we can extend sync_regs to debug registers in the future!

I agree that regs, sregs and events should be present since the first
revision.

Paolo
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e63a35fafef0..9536eb310d04 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3951,6 +3951,55 @@  Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be
 accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from
 the guest.
 
+6.74 KVM_CAP_SYNC_REGS
+Architectures: s390, x86 (vmx-only)
+Target: s390: always enabled, x86: vcpu
+Parameters: s390: none, x86: args[0] takes bitmask selecting which register
+sets to copy out (bitfields defined in arch/x86/include/uapi/asm/kvm.h).
+
+As described above in the kvm_sync_regs struct info in section 5 (kvm_run):
+KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers
+without having to call SET/GET_*REGS". This reduces overhead by eliminating
+repeated ioctl calls for setting and/or getting register values. This is
+particularly important when userspace is making synchronous guest state
+modifications, e.g. when emulating and/or intercepting instructions in
+userspace.
+
+For s390 specifics, please the source code.
+
+For x86:
+- the capability can be enabled/disabled (s390 is always enabled).
+- the register sets to be copied out out to kvm_run are selectable
+  by userspace (rather that all sets being copied out for every exit).
+- vcpu_events and selectable MSRs are available in addition to the
+  basic register sets (regs, sregs, debug_regs, and fpu).
+
+In addition to the register sets, the x86 'kvm_sync_regs' struct definition
+includes the bitarray, 'sync_regs', that toggles whether specific register
+sets are to be copied out. When enabling SYNC_REGS via KVM_CAP_ENABLE, the
+value in args[0] is copied into the 'sync_regs' bitarray value by KVM. But
+like the register sets, the copy-out field can be modified while handling
+an exit so that additonal KVM_CAP_ENABLE ioctl calls can be avoided.
+
+To indicate when userspace has modified values that should be copied into
+the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set.
+This is done using the same bitflags as for the 'sync_regs' copy-out field.
+If the dirty bit is not set, then the register values will not be copied
+into the vCPU even if they've been modified.
+
+Unused bitfields in the bitarrays must be set to zero.
+
+struct kvm_sync_regs {
+        __u64 sync_regs;
+        struct kvm_regs regs;
+        struct kvm_sregs sregs;
+        struct kvm_debugregs debugregs;
+        struct kvm_fpu fpu;
+        struct kvm_vcpu_events events;
+        struct kvm_msrs msrs;
+        struct kvm_msr_entry msr_entries[SYNC_REGS_NUM_MSRS];
+};
+
 7. Capabilities that can be enabled on VMs
 ------------------------------------------
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index c2824d02ba37..cdee17a4b0ee 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -353,8 +353,38 @@  struct kvm_xcrs {
 	__u64 padding[16];
 };
 
-/* definition of registers in kvm_run */
+#define KVM_SYNC_X86_REGS      (1UL << 0)
+#define KVM_SYNC_X86_SREGS     (1UL << 1)
+#define KVM_SYNC_X86_DEBUGREGS (1UL << 2)
+#define KVM_SYNC_X86_FPU       (1UL << 3)
+#define KVM_SYNC_X86_EVENTS    (1UL << 4)
+#define KVM_SYNC_X86_MSRS      (1UL << 5)
+#define KVM_SYNC_X86_NUM_FIELDS		6
+
+#define KVM_SYNC_X86_VALID_BITS \
+	(KVM_SYNC_X86_REGS| \
+	 KVM_SYNC_X86_SREGS| \
+	 KVM_SYNC_X86_DEBUGREGS| \
+	 KVM_SYNC_X86_FPU| \
+	 KVM_SYNC_X86_EVENTS| \
+	 KVM_SYNC_X86_MSRS)
+
+#define SYNC_REGS_NUM_MSRS 8
+/* kvm_sync_regs struct included by kvm_run struct */
 struct kvm_sync_regs {
+	/* Members of this structure are potentially malicious.
+	 * Care must be taken by code reading, esp. interpreting,
+	 * data fields from them inside KVM to prevent TOCTOU and
+	 * double-fetch types of vulnerabilities.
+	 */
+	__u64 sync_regs;
+	struct kvm_regs regs;
+	struct kvm_sregs sregs;
+	struct kvm_debugregs debugregs;
+	struct kvm_fpu fpu;
+	struct kvm_vcpu_events events;
+	struct kvm_msrs msrs;
+	struct kvm_msr_entry msr_entries[SYNC_REGS_NUM_MSRS];
 };
 
 #define KVM_X86_QUIRK_LINT0_REENABLED	(1 << 0)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cd17b7d9a107..ad3f84cbf7f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -100,6 +100,8 @@  static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
 static void enter_smm(struct kvm_vcpu *vcpu);
 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+static void sync_regs_store_to_kvmrun(struct kvm_vcpu *vcpu);
+static int sync_regs_load_from_kvmrun(struct kvm_vcpu *vcpu);
 
 struct kvm_x86_ops *kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -2587,15 +2589,22 @@  EXPORT_SYMBOL_GPL(kvm_get_msr_common);
  *
  * @return number of msrs set successfully.
  */
-static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+static int __msr_io(struct kvm_vcpu *vcpu, u32 nmsrs, struct kvm_msrs *msrs,
 		    struct kvm_msr_entry *entries,
 		    int (*do_msr)(struct kvm_vcpu *vcpu,
 				  unsigned index, u64 *data))
 {
 	int i, idx;
 
+	/* For SYNC_REGS call into __msr_io, it is possible that userspace
+	 * might have modified the count since it was snapshotted. Could be
+	 * accidental but might be intentional attempt at TOCTOU.
+	 */
+	if (nmsrs != msrs->nmsrs)
+		return -EINVAL;
+
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	for (i = 0; i < msrs->nmsrs; ++i)
+	for (i = 0; i < nmsrs; ++i)
 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
 			break;
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -2633,7 +2642,7 @@  static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 		goto out;
 	}
 
-	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
+	r = n = __msr_io(vcpu, msrs.nmsrs, &msrs, entries, do_msr);
 	if (r < 0)
 		goto out_free;
 
@@ -2665,6 +2674,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_NOP_IO_DELAY:
 	case KVM_CAP_MP_STATE:
 	case KVM_CAP_SYNC_MMU:
+	case KVM_CAP_SYNC_REGS:
 	case KVM_CAP_USER_NMI:
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
@@ -3435,6 +3445,11 @@  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 		return kvm_hv_activate_synic(vcpu, cap->cap ==
 					     KVM_CAP_HYPERV_SYNIC2);
+	case KVM_CAP_SYNC_REGS:
+		if (cap->args[0] & ~KVM_SYNC_X86_VALID_BITS)
+			return -EINVAL;
+		vcpu->run->s.regs.sync_regs = cap->args[0];
+		return 0;
 	default:
 		return -EINVAL;
 	}
@@ -7247,6 +7262,12 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		goto out;
 	}
 
+	if (vcpu->run->kvm_dirty_regs) {
+		r = sync_regs_load_from_kvmrun(vcpu);
+		if (r != 0)
+			goto out;
+	}
+
 	/* re-sync apic's tpr */
 	if (!lapic_in_kernel(vcpu)) {
 		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
@@ -7270,6 +7291,8 @@  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		r = vcpu_run(vcpu);
 
 out:
+	if (vcpu->run->s.regs.sync_regs)
+		sync_regs_store_to_kvmrun(vcpu);
 	post_kvm_run_save(vcpu);
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -7648,6 +7671,111 @@  int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return 0;
 }
 
+static void sync_regs_store_to_kvmrun(struct kvm_vcpu *vcpu)
+{
+	BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_UNION_SIZE_BYTES);
+	vcpu->run->kvm_valid_regs = 0;
+
+	if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_REGS) {
+		kvm_arch_vcpu_ioctl_get_regs(vcpu, &vcpu->run->s.regs.regs);
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_REGS;
+	}
+	if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_SREGS) {
+		kvm_arch_vcpu_ioctl_get_sregs(vcpu, &vcpu->run->s.regs.sregs);
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_SREGS;
+	}
+	if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_DEBUGREGS) {
+		kvm_vcpu_ioctl_x86_get_debugregs(
+				vcpu, &vcpu->run->s.regs.debugregs);
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_DEBUGREGS;
+	}
+	if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_FPU) {
+		kvm_arch_vcpu_ioctl_get_fpu(vcpu, &vcpu->run->s.regs.fpu);
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_FPU;
+	}
+	if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_EVENTS) {
+		kvm_vcpu_ioctl_x86_get_vcpu_events(
+				vcpu, &vcpu->run->s.regs.events);
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_EVENTS;
+	}
+	if (vcpu->run->s.regs.sync_regs & KVM_SYNC_X86_MSRS) {
+		u32 nmsrs = vcpu->run->s.regs.msrs.nmsrs;
+
+		if (nmsrs > SYNC_REGS_NUM_MSRS)
+			nmsrs = vcpu->run->s.regs.msrs.nmsrs
+					= SYNC_REGS_NUM_MSRS;
+		__msr_io(vcpu, nmsrs, &vcpu->run->s.regs.msrs,
+			 vcpu->run->s.regs.msr_entries, do_get_msr);
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_X86_MSRS;
+	}
+}
+
+static int sync_regs_load_from_kvmrun(struct kvm_vcpu *vcpu)
+{
+	int r = -EINVAL;
+
+	if (unlikely(vcpu->run->kvm_dirty_regs == 0))
+		goto out;
+
+	if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_BITS)
+		goto out;
+
+	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
+		if (kvm_arch_vcpu_ioctl_set_regs(
+				vcpu, &vcpu->run->s.regs.regs))
+			goto out;
+		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
+	}
+	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
+		if (kvm_arch_vcpu_ioctl_set_sregs(
+				vcpu, &vcpu->run->s.regs.sregs))
+			goto out;
+		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
+	}
+	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_DEBUGREGS) {
+		if (kvm_vcpu_ioctl_x86_set_debugregs(
+				vcpu, &vcpu->run->s.regs.debugregs))
+			goto out;
+		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_DEBUGREGS;
+	}
+	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_FPU) {
+		if (kvm_arch_vcpu_ioctl_set_fpu(
+				vcpu, &vcpu->run->s.regs.fpu))
+			goto out;
+		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_FPU;
+	}
+	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
+		if (kvm_vcpu_ioctl_x86_set_vcpu_events(
+				vcpu, &vcpu->run->s.regs.events))
+			goto out;
+		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
+	}
+	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_MSRS) {
+		u32 nmsrs_in = vcpu->run->s.regs.msrs.nmsrs;
+		u32 nmsrs_out = 0;
+
+		if (!nmsrs_in)
+			goto out;
+		else if (nmsrs_in > SYNC_REGS_NUM_MSRS) {
+			r = -E2BIG;
+			goto out;
+		}
+		nmsrs_out = __msr_io(vcpu, nmsrs_in, &vcpu->run->s.regs.msrs,
+				     vcpu->run->s.regs.msr_entries, do_set_msr);
+		if (!nmsrs_out) {
+			r = -EINVAL;
+			goto out;
+		} else if (nmsrs_in != nmsrs_out) {
+			r = nmsrs_out;
+			goto out;
+		}
+		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_MSRS;
+	}
+	r = 0;
+out:
+	return r;
+}
+
 static void fx_init(struct kvm_vcpu *vcpu)
 {
 	fpstate_init(&vcpu->arch.guest_fpu.state);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 838887587411..90a6b24df901 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -395,6 +395,10 @@  struct kvm_run {
 		char padding[256];
 	};
 
+	/* 2048 is the size of the char array that s390 originally used to
+	 * bound/pad the size of the union that holds the sync regs.
+	 */
+#define SYNC_REGS_UNION_SIZE_BYTES 2048
 	/*
 	 * shared registers between kvm and userspace.
 	 * kvm_valid_regs specifies the register classes set by the host
@@ -406,7 +410,7 @@  struct kvm_run {
 	__u64 kvm_dirty_regs;
 	union {
 		struct kvm_sync_regs regs;
-		char padding[2048];
+		char padding[SYNC_REGS_UNION_SIZE_BYTES];
 	} s;
 };