diff mbox series

[PULL,20/25] target/i386: kvm: Add support for save and restore nested state

Message ID 1561116620-22245-21-git-send-email-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show
Series [PULL,01/25] kvm-all: Add/update fprintf's for kvm_*_ioeventfd_del | expand

Commit Message

Paolo Bonzini June 21, 2019, 11:30 a.m. UTC
From: Liran Alon <liran.alon@oracle.com>

Kernel commit 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE")
introduced new IOCTLs to extract and restore vCPU state related to
Intel VMX & AMD SVM.

Utilize these IOCTLs to add support for migration of VMs which are
running nested hypervisors.

Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Maran Wilson <maran.wilson@oracle.com>
Tested-by: Maran Wilson <maran.wilson@oracle.com>
Signed-off-by: Liran Alon <liran.alon@oracle.com>
Message-Id: <20190619162140.133674-9-liran.alon@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 accel/kvm/kvm-all.c   |   8 ++
 include/sysemu/kvm.h  |   1 +
 target/i386/cpu.h     |   3 +
 target/i386/kvm.c     |  80 ++++++++++++++++++++
 target/i386/machine.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 290 insertions(+)

Comments

Liran Alon June 21, 2019, 12:29 p.m. UTC | #1
> On 21 Jun 2019, at 14:30, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> From: Liran Alon <liran.alon@oracle.com>
> 
> Kernel commit 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE")
> introduced new IOCTLs to extract and restore vCPU state related to
> Intel VMX & AMD SVM.
> 
> Utilize these IOCTLs to add support for migration of VMs which are
> running nested hypervisors.
> 
> Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
> Reviewed-by: Maran Wilson <maran.wilson@oracle.com>
> Tested-by: Maran Wilson <maran.wilson@oracle.com>
> Signed-off-by: Liran Alon <liran.alon@oracle.com>
> Message-Id: <20190619162140.133674-9-liran.alon@oracle.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> accel/kvm/kvm-all.c   |   8 ++
> include/sysemu/kvm.h  |   1 +
> target/i386/cpu.h     |   3 +
> target/i386/kvm.c     |  80 ++++++++++++++++++++
> target/i386/machine.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 290 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index f0f5ab8..e3cf728 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -87,6 +87,7 @@ struct KVMState
> #ifdef KVM_CAP_SET_GUEST_DEBUG
>     QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
> #endif
> +    int max_nested_state_len;
>     int many_ioeventfds;
>     int intx_set_mask;
>     bool sync_mmu;
> @@ -1681,6 +1682,8 @@ static int kvm_init(MachineState *ms)
>     s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
> #endif
> 
> +    s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
> +
> #ifdef KVM_CAP_IRQ_ROUTING
>     kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
> #endif
> @@ -2248,6 +2251,11 @@ int kvm_has_debugregs(void)
>     return kvm_state->debugregs;
> }
> 
> +int kvm_max_nested_state_length(void)
> +{
> +    return kvm_state->max_nested_state_len;
> +}
> +
> int kvm_has_many_ioeventfds(void)
> {
>     if (!kvm_enabled()) {
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 64f55e5..acd90ae 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -210,6 +210,7 @@ bool kvm_has_sync_mmu(void);
> int kvm_has_vcpu_events(void);
> int kvm_has_robust_singlestep(void);
> int kvm_has_debugregs(void);
> +int kvm_max_nested_state_length(void);
> int kvm_has_pit_state2(void);
> int kvm_has_many_ioeventfds(void);
> int kvm_has_gsi_routing(void);
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index bf0c9c2..17116ef 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -1360,6 +1360,9 @@ typedef struct CPUX86State {
> #if defined(CONFIG_KVM) || defined(CONFIG_HVF)
>     void *xsave_buf;
> #endif
> +#if defined(CONFIG_KVM)
> +    struct kvm_nested_state *nested_state;
> +#endif
> #if defined(CONFIG_HVF)
>     HVFX86EmulatorState *hvf_emul;
> #endif
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index f9872f1..e924663 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -1324,6 +1324,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
>     struct kvm_cpuid_entry2 *c;
>     uint32_t signature[3];
>     int kvm_base = KVM_CPUID_SIGNATURE;
> +    int max_nested_state_len;
>     int r;
>     Error *local_err = NULL;
> 
> @@ -1658,6 +1659,24 @@ int kvm_arch_init_vcpu(CPUState *cs)
>     if (has_xsave) {
>         env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
>     }
> +
> +    max_nested_state_len = kvm_max_nested_state_length();
> +    if (max_nested_state_len > 0) {
> +        assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
> +        env->nested_state = g_malloc0(max_nested_state_len);
> +
> +        env->nested_state->size = max_nested_state_len;
> +
> +        if (IS_INTEL_CPU(env)) {

I think it’s better to change this to: “if (cpu_has_vmx(env))” {

> +            struct kvm_vmx_nested_state_hdr *vmx_hdr =
> +                &env->nested_state->hdr.vmx;
> +
> +            env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
> +            vmx_hdr->vmxon_pa = -1ull;
> +            vmx_hdr->vmcs12_pa = -1ull;
> +        }
> +    }

I think we should add here:
} else if (cpu_has_svm(env)) {
    env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
}

> +
>     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
> 
>     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
> @@ -1682,12 +1701,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
> int kvm_arch_destroy_vcpu(CPUState *cs)
> {
>     X86CPU *cpu = X86_CPU(cs);
> +    CPUX86State *env = &cpu->env;
> 
>     if (cpu->kvm_msr_buf) {
>         g_free(cpu->kvm_msr_buf);
>         cpu->kvm_msr_buf = NULL;
>     }
> 
> +    if (env->nested_state) {
> +        g_free(env->nested_state);
> +        env->nested_state = NULL;
> +    }
> +
>     return 0;
> }
> 
> @@ -3411,6 +3436,52 @@ static int kvm_get_debugregs(X86CPU *cpu)
>     return 0;
> }
> 
> +static int kvm_put_nested_state(X86CPU *cpu)
> +{
> +    CPUX86State *env = &cpu->env;
> +    int max_nested_state_len = kvm_max_nested_state_length();
> +
> +    if (max_nested_state_len <= 0) {
> +        return 0;
> +    }
> +
> +    assert(env->nested_state->size <= max_nested_state_len);
> +    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
> +}
> +
> +static int kvm_get_nested_state(X86CPU *cpu)
> +{
> +    CPUX86State *env = &cpu->env;
> +    int max_nested_state_len = kvm_max_nested_state_length();
> +    int ret;
> +
> +    if (max_nested_state_len <= 0) {
> +        return 0;
> +    }
> +
> +    /*
> +     * It is possible that migration restored a smaller size into
> +     * nested_state->hdr.size than what our kernel support.
> +     * We preserve migration origin nested_state->hdr.size for
> +     * call to KVM_SET_NESTED_STATE but wish that our next call
> +     * to KVM_GET_NESTED_STATE will use max size our kernel support.
> +     */
> +    env->nested_state->size = max_nested_state_len;
> +
> +    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
> +        env->hflags |= HF_GUEST_MASK;
> +    } else {
> +        env->hflags &= ~HF_GUEST_MASK;
> +    }
> +
> +    return ret;
> +}
> +
> int kvm_arch_put_registers(CPUState *cpu, int level)
> {
>     X86CPU *x86_cpu = X86_CPU(cpu);
> @@ -3418,6 +3489,11 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> 
>     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
> 
> +    ret = kvm_put_nested_state(x86_cpu);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
>     if (level >= KVM_PUT_RESET_STATE) {
>         ret = kvm_put_msr_feature_control(x86_cpu);
>         if (ret < 0) {
> @@ -3533,6 +3609,10 @@ int kvm_arch_get_registers(CPUState *cs)
>     if (ret < 0) {
>         goto out;
>     }
> +    ret = kvm_get_nested_state(cpu);
> +    if (ret < 0) {
> +        goto out;
> +    }
>     ret = 0;
>  out:
>     cpu_sync_bndcs_hflags(&cpu->env);
> diff --git a/target/i386/machine.c b/target/i386/machine.c
> index a39ce7f..a6afdf8 100644
> --- a/target/i386/machine.c
> +++ b/target/i386/machine.c
> @@ -231,6 +231,15 @@ static int cpu_pre_save(void *opaque)
>         env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK);
>     }
> 
> +#ifdef CONFIG_KVM
> +    /* Verify we have nested virtualization state from kernel if required */
> +    if (cpu_has_nested_virt(env) && !env->nested_state) {
> +        error_report("Guest enabled nested virtualization but kernel "
> +                "does not support saving of nested state");
> +        return -EINVAL;
> +    }
> +#endif

Paolo, just note that this will effectively block migration for an AMD vCPU exposed with SVM.
I think it’s the right thing to do, but you were afraid of backwards compatibility regarding this, no?

-Liran

> +
>     return 0;
> }
> 
> @@ -278,6 +287,16 @@ static int cpu_post_load(void *opaque, int version_id)
>     env->hflags &= ~HF_CPL_MASK;
>     env->hflags |= (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
> 
> +#ifdef CONFIG_KVM
> +    if ((env->hflags & HF_GUEST_MASK) &&
> +        (!env->nested_state ||
> +        !(env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE))) {
> +        error_report("vCPU set in guest-mode inconsistent with "
> +                     "migrated kernel nested state");
> +        return -EINVAL;
> +    }
> +#endif
> +
>     env->fpstt = (env->fpus_vmstate >> 11) & 7;
>     env->fpus = env->fpus_vmstate & ~0x3800;
>     env->fptag_vmstate ^= 0xff;
> @@ -851,6 +870,182 @@ static const VMStateDescription vmstate_tsc_khz = {
>     }
> };
> 
> +#ifdef CONFIG_KVM
> +
> +static bool vmx_vmcs12_needed(void *opaque)
> +{
> +    struct kvm_nested_state *nested_state = opaque;
> +    return (nested_state->size >
> +            offsetof(struct kvm_nested_state, data.vmx[0].vmcs12));
> +}
> +
> +static const VMStateDescription vmstate_vmx_vmcs12 = {
> +    .name = "cpu/kvm_nested_state/vmx/vmcs12",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = vmx_vmcs12_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT8_ARRAY(data.vmx[0].vmcs12,
> +                            struct kvm_nested_state,
> +                            KVM_STATE_NESTED_VMX_VMCS_SIZE),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static bool vmx_shadow_vmcs12_needed(void *opaque)
> +{
> +    struct kvm_nested_state *nested_state = opaque;
> +    return (nested_state->size >
> +            offsetof(struct kvm_nested_state, data.vmx[0].shadow_vmcs12));
> +}
> +
> +static const VMStateDescription vmstate_vmx_shadow_vmcs12 = {
> +    .name = "cpu/kvm_nested_state/vmx/shadow_vmcs12",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = vmx_shadow_vmcs12_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT8_ARRAY(data.vmx[0].shadow_vmcs12,
> +                            struct kvm_nested_state,
> +                            KVM_STATE_NESTED_VMX_VMCS_SIZE),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static bool vmx_nested_state_needed(void *opaque)
> +{
> +    struct kvm_nested_state *nested_state = opaque;
> +
> +    return ((nested_state->format == KVM_STATE_NESTED_FORMAT_VMX) &&
> +            ((nested_state->hdr.vmx.vmxon_pa != -1ull) ||
> +             (nested_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)));
> +}
> +
> +static const VMStateDescription vmstate_vmx_nested_state = {
> +    .name = "cpu/kvm_nested_state/vmx",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = vmx_nested_state_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_U64(hdr.vmx.vmxon_pa, struct kvm_nested_state),
> +        VMSTATE_U64(hdr.vmx.vmcs12_pa, struct kvm_nested_state),
> +        VMSTATE_U16(hdr.vmx.smm.flags, struct kvm_nested_state),
> +        VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription*[]) {
> +        &vmstate_vmx_vmcs12,
> +        &vmstate_vmx_shadow_vmcs12,
> +        NULL,
> +    }
> +};
> +
> +static bool svm_nested_state_needed(void *opaque)
> +{
> +    struct kvm_nested_state *nested_state = opaque;
> +
> +    return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM);
> +}
> +
> +static const VMStateDescription vmstate_svm_nested_state = {
> +    .name = "cpu/kvm_nested_state/svm",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = svm_nested_state_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static bool nested_state_needed(void *opaque)
> +{
> +    X86CPU *cpu = opaque;
> +    CPUX86State *env = &cpu->env;
> +
> +    return (env->nested_state &&
> +            (vmx_nested_state_needed(env->nested_state) ||
> +             svm_nested_state_needed(env->nested_state)));
> +}
> +
> +static int nested_state_post_load(void *opaque, int version_id)
> +{
> +    X86CPU *cpu = opaque;
> +    CPUX86State *env = &cpu->env;
> +    struct kvm_nested_state *nested_state = env->nested_state;
> +    int min_nested_state_len = offsetof(struct kvm_nested_state, data);
> +    int max_nested_state_len = kvm_max_nested_state_length();
> +
> +    /*
> +     * If our kernel don't support setting nested state
> +     * and we have received nested state from migration stream,
> +     * we need to fail migration
> +     */
> +    if (max_nested_state_len <= 0) {
> +        error_report("Received nested state when kernel cannot restore it");
> +        return -EINVAL;
> +    }
> +
> +    /*
> +     * Verify that the size of received nested_state struct
> +     * at least cover required header and is not larger
> +     * than the max size that our kernel support
> +     */
> +    if (nested_state->size < min_nested_state_len) {
> +        error_report("Received nested state size less than min: "
> +                     "len=%d, min=%d",
> +                     nested_state->size, min_nested_state_len);
> +        return -EINVAL;
> +    }
> +    if (nested_state->size > max_nested_state_len) {
> +        error_report("Recieved unsupported nested state size: "
> +                     "nested_state->size=%d, max=%d",
> +                     nested_state->size, max_nested_state_len);
> +        return -EINVAL;
> +    }
> +
> +    /* Verify format is valid */
> +    if ((nested_state->format != KVM_STATE_NESTED_FORMAT_VMX) &&
> +        (nested_state->format != KVM_STATE_NESTED_FORMAT_SVM)) {
> +        error_report("Received invalid nested state format: %d",
> +                     nested_state->format);
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static const VMStateDescription vmstate_kvm_nested_state = {
> +    .name = "cpu/kvm_nested_state",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_U16(flags, struct kvm_nested_state),
> +        VMSTATE_U16(format, struct kvm_nested_state),
> +        VMSTATE_U32(size, struct kvm_nested_state),
> +        VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription*[]) {
> +        &vmstate_vmx_nested_state,
> +        &vmstate_svm_nested_state,
> +        NULL
> +    }
> +};
> +
> +static const VMStateDescription vmstate_nested_state = {
> +    .name = "cpu/nested_state",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = nested_state_needed,
> +    .post_load = nested_state_post_load,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_STRUCT_POINTER(env.nested_state, X86CPU,
> +                vmstate_kvm_nested_state,
> +                struct kvm_nested_state),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +#endif
> +
> static bool mcg_ext_ctl_needed(void *opaque)
> {
>     X86CPU *cpu = opaque;
> @@ -1113,6 +1308,9 @@ VMStateDescription vmstate_x86_cpu = {
> #ifndef TARGET_X86_64
>         &vmstate_efer32,
> #endif
> +#ifdef CONFIG_KVM
> +        &vmstate_nested_state,
> +#endif
>         NULL
>     }
> };
> -- 
> 1.8.3.1
> 
>
Paolo Bonzini June 21, 2019, 12:45 p.m. UTC | #2
On 21/06/19 14:29, Liran Alon wrote:
>> +    max_nested_state_len = kvm_max_nested_state_length();
>> +    if (max_nested_state_len > 0) {
>> +        assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
>> +        env->nested_state = g_malloc0(max_nested_state_len);
>> +
>> +        env->nested_state->size = max_nested_state_len;
>> +
>> +        if (IS_INTEL_CPU(env)) {
> I think it’s better to change this to: “if (cpu_has_vmx(env))” {
> 
>> +            struct kvm_vmx_nested_state_hdr *vmx_hdr =
>> +                &env->nested_state->hdr.vmx;
>> +
>> +            env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
>> +            vmx_hdr->vmxon_pa = -1ull;
>> +            vmx_hdr->vmcs12_pa = -1ull;
>> +        }
>> +    }
> I think we should add here:
> } else if (cpu_has_svm(env)) {
>     env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
> }

Or even force max_nested_state_len to 0 for AMD hosts, so that
kvm_get/put_nested_state are dropped completely.

Paolo
Liran Alon June 21, 2019, 12:48 p.m. UTC | #3
> On 21 Jun 2019, at 15:45, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 21/06/19 14:29, Liran Alon wrote:
>>> +    max_nested_state_len = kvm_max_nested_state_length();
>>> +    if (max_nested_state_len > 0) {
>>> +        assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
>>> +        env->nested_state = g_malloc0(max_nested_state_len);
>>> +
>>> +        env->nested_state->size = max_nested_state_len;
>>> +
>>> +        if (IS_INTEL_CPU(env)) {
>> I think it’s better to change this to: “if (cpu_has_vmx(env))” {
>> 
>>> +            struct kvm_vmx_nested_state_hdr *vmx_hdr =
>>> +                &env->nested_state->hdr.vmx;
>>> +
>>> +            env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
>>> +            vmx_hdr->vmxon_pa = -1ull;
>>> +            vmx_hdr->vmcs12_pa = -1ull;
>>> +        }
>>> +    }
>> I think we should add here:
>> } else if (cpu_has_svm(env)) {
>>    env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
>> }
> 
> Or even force max_nested_state_len to 0 for AMD hosts, so that
> kvm_get/put_nested_state are dropped completely.
> 
> Paolo
> 

On AMD hosts, KVM returns 0 for KVM_CAP_NESTED_STATE because
kvm-amd.ko has kvm_x86_ops->get_nested_state set to NULL.
See kvm_vm_ioctl_check_extension().

I just thought it would be nicer to add what I proposed above, as when the kernel adds support
for nested state on AMD hosts, QEMU would maybe just work.
(Because maybe all state required for AMD nSVM is just flags in env->nested_state->flags).

-Liran
Paolo Bonzini June 21, 2019, 2:55 p.m. UTC | #4
On 21/06/19 14:48, Liran Alon wrote:
> 
> 
>> On 21 Jun 2019, at 15:45, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>> On 21/06/19 14:29, Liran Alon wrote:
>>>> +    max_nested_state_len = kvm_max_nested_state_length();
>>>> +    if (max_nested_state_len > 0) {
>>>> +        assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
>>>> +        env->nested_state = g_malloc0(max_nested_state_len);
>>>> +
>>>> +        env->nested_state->size = max_nested_state_len;
>>>> +
>>>> +        if (IS_INTEL_CPU(env)) {
>>> I think it’s better to change this to: “if (cpu_has_vmx(env))” {
>>>
>>>> +            struct kvm_vmx_nested_state_hdr *vmx_hdr =
>>>> +                &env->nested_state->hdr.vmx;
>>>> +
>>>> +            env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
>>>> +            vmx_hdr->vmxon_pa = -1ull;
>>>> +            vmx_hdr->vmcs12_pa = -1ull;
>>>> +        }
>>>> +    }
>>> I think we should add here:
>>> } else if (cpu_has_svm(env)) {
>>>    env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
>>> }
>>
>> Or even force max_nested_state_len to 0 for AMD hosts, so that
>> kvm_get/put_nested_state are dropped completely.
>>
>> Paolo
>>
> 
> On AMD hosts, KVM returns 0 for KVM_CAP_NESTED_STATE because
> kvm-amd.ko has kvm_x86_ops->get_nested_state set to NULL.
> See kvm_vm_ioctl_check_extension().

Yes, now it does, my idea was to force that behavior in QEMU until we
know what SVM support actually looks like.

In principle I don't like the idea of crossing fingers, even though
there's an actual possibility that it could work.  But it couldn't be
worse than what we have now, so maybe it *is* actually a good idea, just
with some comment that explains the rationale.

Paolo


> I just thought it will be nicer to add what I proposed above as when kernel adds support
> for nested state on AMD host, QEMU would maybe just work.
> (Because maybe all state required for AMD nSVM is just flags in env->nested_state->flags).
> 
> -Liran
>
Liran Alon June 21, 2019, 3 p.m. UTC | #5
> On 21 Jun 2019, at 17:55, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 21/06/19 14:48, Liran Alon wrote:
>> 
>> 
>>> On 21 Jun 2019, at 15:45, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> 
>>> On 21/06/19 14:29, Liran Alon wrote:
>>>>> +    max_nested_state_len = kvm_max_nested_state_length();
>>>>> +    if (max_nested_state_len > 0) {
>>>>> +        assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
>>>>> +        env->nested_state = g_malloc0(max_nested_state_len);
>>>>> +
>>>>> +        env->nested_state->size = max_nested_state_len;
>>>>> +
>>>>> +        if (IS_INTEL_CPU(env)) {
>>>> I think it’s better to change this to: “if (cpu_has_vmx(env))” {
>>>> 
>>>>> +            struct kvm_vmx_nested_state_hdr *vmx_hdr =
>>>>> +                &env->nested_state->hdr.vmx;
>>>>> +
>>>>> +            env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
>>>>> +            vmx_hdr->vmxon_pa = -1ull;
>>>>> +            vmx_hdr->vmcs12_pa = -1ull;
>>>>> +        }
>>>>> +    }
>>>> I think we should add here:
>>>> } else if (cpu_has_svm(env)) {
>>>>   env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
>>>> }
>>> 
>>> Or even force max_nested_state_len to 0 for AMD hosts, so that
>>> kvm_get/put_nested_state are dropped completely.
>>> 
>>> Paolo
>>> 
>> 
>> On AMD hosts, KVM returns 0 for KVM_CAP_NESTED_STATE because
>> kvm-amd.ko has kvm_x86_ops->get_nested_state set to NULL.
>> See kvm_vm_ioctl_check_extension().
> 
> Yes, now it does, my idea was to force that behavior in QEMU until we
> know what SVM support actually looks like.
> 
> In principle I don't like the idea of crossing fingers, even though
> there's an actual possibility that it could work.  But it couldn't be
> worse than what we have now, so maybe it *is* actually a good idea, just
> with some comment that explains the rationale.
> 
> Paolo

Cool.
Are you planning to make those changes when applying/merging or
do you need me to submit a new patch-series version?
Also note my comment on the other patch regarding block migration on AMD vCPU which expose SVM.

-Liran

> 
> 
>> I just thought it will be nicer to add what I proposed above as when kernel adds support
>> for nested state on AMD host, QEMU would maybe just work.
>> (Because maybe all state required for AMD nSVM is just flags in env->nested_state->flags).
>> 
>> -Liran
>> 
>
Paolo Bonzini June 21, 2019, 3:39 p.m. UTC | #6
On 21/06/19 17:00, Liran Alon wrote:
> Cool.
> Are you planning to make those changes when applying/merging or
> do you need me to submit a new patch-series version?
> Also note my comment on the other patch regarding block migration on AMD vCPU which expose SVM.

It's already merged, but it's not a big deal since it's only AMD.  We
can follow up before release.

Paolo
Liran Alon June 21, 2019, 3:44 p.m. UTC | #7
> On 21 Jun 2019, at 18:39, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 21/06/19 17:00, Liran Alon wrote:
>> Cool.
>> Are you planning to make those changes when applying/merging or
>> do you need me to submit a new patch-series version?
>> Also note my comment on the other patch regarding block migration on AMD vCPU which expose SVM.
> 
> It's already merged, but it's not a big deal since it's only AMD.  We
> can follow up before release.
> 
> Paolo

Ok then. We at least now have nVMX migration working in QEMU! :)
I will just submit additional separate patches on top of QEMU master.

Thanks,
-Liran
Liran Alon June 21, 2019, 4:01 p.m. UTC | #8
> On 21 Jun 2019, at 18:44, Liran Alon <liran.alon@oracle.com> wrote:
> 
> 
> 
>> On 21 Jun 2019, at 18:39, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> 
>> On 21/06/19 17:00, Liran Alon wrote:
>>> Cool.
>>> Are you planning to make those changes when applying/merging or
>>> do you need me to submit a new patch-series version?
>>> Also note my comment on the other patch regarding block migration on AMD vCPU which expose SVM.
>> 
>> It's already merged, but it's not a big deal since it's only AMD.  We
>> can follow up before release.
>> 
>> Paolo
> 
> Ok then. We at least now have nVMX migration working in QEMU! :)
> I will just submit additional separate patches on top of QEMU master.
> 
> Thanks,
> -Liran
> 

Oh the applied patch-series is not very nice actually…
It seems that some of the commits cannot even compile, such as "target/i386: kvm: Block migration for vCPUs exposed with nested virtualization".
You have removed cpu_has_nested_virt(env) from that commit even though it is used…
But as the author of the commit I will be blamed for this broken bisection :(
LOL. Oh well… Mistakes happen. :)

-Liran
diff mbox series

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f0f5ab8..e3cf728 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -87,6 +87,7 @@  struct KVMState
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
 #endif
+    int max_nested_state_len;
     int many_ioeventfds;
     int intx_set_mask;
     bool sync_mmu;
@@ -1681,6 +1682,8 @@  static int kvm_init(MachineState *ms)
     s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
 #endif
 
+    s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
+
 #ifdef KVM_CAP_IRQ_ROUTING
     kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
 #endif
@@ -2248,6 +2251,11 @@  int kvm_has_debugregs(void)
     return kvm_state->debugregs;
 }
 
+int kvm_max_nested_state_length(void)
+{
+    return kvm_state->max_nested_state_len;
+}
+
 int kvm_has_many_ioeventfds(void)
 {
     if (!kvm_enabled()) {
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 64f55e5..acd90ae 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -210,6 +210,7 @@  bool kvm_has_sync_mmu(void);
 int kvm_has_vcpu_events(void);
 int kvm_has_robust_singlestep(void);
 int kvm_has_debugregs(void);
+int kvm_max_nested_state_length(void);
 int kvm_has_pit_state2(void);
 int kvm_has_many_ioeventfds(void);
 int kvm_has_gsi_routing(void);
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index bf0c9c2..17116ef 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1360,6 +1360,9 @@  typedef struct CPUX86State {
 #if defined(CONFIG_KVM) || defined(CONFIG_HVF)
     void *xsave_buf;
 #endif
+#if defined(CONFIG_KVM)
+    struct kvm_nested_state *nested_state;
+#endif
 #if defined(CONFIG_HVF)
     HVFX86EmulatorState *hvf_emul;
 #endif
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index f9872f1..e924663 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1324,6 +1324,7 @@  int kvm_arch_init_vcpu(CPUState *cs)
     struct kvm_cpuid_entry2 *c;
     uint32_t signature[3];
     int kvm_base = KVM_CPUID_SIGNATURE;
+    int max_nested_state_len;
     int r;
     Error *local_err = NULL;
 
@@ -1658,6 +1659,24 @@  int kvm_arch_init_vcpu(CPUState *cs)
     if (has_xsave) {
         env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
     }
+
+    max_nested_state_len = kvm_max_nested_state_length();
+    if (max_nested_state_len > 0) {
+        assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
+        env->nested_state = g_malloc0(max_nested_state_len);
+
+        env->nested_state->size = max_nested_state_len;
+
+        if (IS_INTEL_CPU(env)) {
+            struct kvm_vmx_nested_state_hdr *vmx_hdr =
+                &env->nested_state->hdr.vmx;
+
+            env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
+            vmx_hdr->vmxon_pa = -1ull;
+            vmx_hdr->vmcs12_pa = -1ull;
+        }
+    }
+
     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
 
     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
@@ -1682,12 +1701,18 @@  int kvm_arch_init_vcpu(CPUState *cs)
 int kvm_arch_destroy_vcpu(CPUState *cs)
 {
     X86CPU *cpu = X86_CPU(cs);
+    CPUX86State *env = &cpu->env;
 
     if (cpu->kvm_msr_buf) {
         g_free(cpu->kvm_msr_buf);
         cpu->kvm_msr_buf = NULL;
     }
 
+    if (env->nested_state) {
+        g_free(env->nested_state);
+        env->nested_state = NULL;
+    }
+
     return 0;
 }
 
@@ -3411,6 +3436,52 @@  static int kvm_get_debugregs(X86CPU *cpu)
     return 0;
 }
 
+static int kvm_put_nested_state(X86CPU *cpu)
+{
+    CPUX86State *env = &cpu->env;
+    int max_nested_state_len = kvm_max_nested_state_length();
+
+    if (max_nested_state_len <= 0) {
+        return 0;
+    }
+
+    assert(env->nested_state->size <= max_nested_state_len);
+    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
+}
+
+static int kvm_get_nested_state(X86CPU *cpu)
+{
+    CPUX86State *env = &cpu->env;
+    int max_nested_state_len = kvm_max_nested_state_length();
+    int ret;
+
+    if (max_nested_state_len <= 0) {
+        return 0;
+    }
+
+    /*
+     * It is possible that migration restored a smaller size into
+     * nested_state->hdr.size than what our kernel support.
+     * We preserve migration origin nested_state->hdr.size for
+     * call to KVM_SET_NESTED_STATE but wish that our next call
+     * to KVM_GET_NESTED_STATE will use max size our kernel support.
+     */
+    env->nested_state->size = max_nested_state_len;
+
+    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
+        env->hflags |= HF_GUEST_MASK;
+    } else {
+        env->hflags &= ~HF_GUEST_MASK;
+    }
+
+    return ret;
+}
+
 int kvm_arch_put_registers(CPUState *cpu, int level)
 {
     X86CPU *x86_cpu = X86_CPU(cpu);
@@ -3418,6 +3489,11 @@  int kvm_arch_put_registers(CPUState *cpu, int level)
 
     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
 
+    ret = kvm_put_nested_state(x86_cpu);
+    if (ret < 0) {
+        return ret;
+    }
+
     if (level >= KVM_PUT_RESET_STATE) {
         ret = kvm_put_msr_feature_control(x86_cpu);
         if (ret < 0) {
@@ -3533,6 +3609,10 @@  int kvm_arch_get_registers(CPUState *cs)
     if (ret < 0) {
         goto out;
     }
+    ret = kvm_get_nested_state(cpu);
+    if (ret < 0) {
+        goto out;
+    }
     ret = 0;
  out:
     cpu_sync_bndcs_hflags(&cpu->env);
diff --git a/target/i386/machine.c b/target/i386/machine.c
index a39ce7f..a6afdf8 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -231,6 +231,15 @@  static int cpu_pre_save(void *opaque)
         env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK);
     }
 
+#ifdef CONFIG_KVM
+    /* Verify we have nested virtualization state from kernel if required */
+    if (cpu_has_nested_virt(env) && !env->nested_state) {
+        error_report("Guest enabled nested virtualization but kernel "
+                "does not support saving of nested state");
+        return -EINVAL;
+    }
+#endif
+
     return 0;
 }
 
@@ -278,6 +287,16 @@  static int cpu_post_load(void *opaque, int version_id)
     env->hflags &= ~HF_CPL_MASK;
     env->hflags |= (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
 
+#ifdef CONFIG_KVM
+    if ((env->hflags & HF_GUEST_MASK) &&
+        (!env->nested_state ||
+        !(env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE))) {
+        error_report("vCPU set in guest-mode inconsistent with "
+                     "migrated kernel nested state");
+        return -EINVAL;
+    }
+#endif
+
     env->fpstt = (env->fpus_vmstate >> 11) & 7;
     env->fpus = env->fpus_vmstate & ~0x3800;
     env->fptag_vmstate ^= 0xff;
@@ -851,6 +870,182 @@  static const VMStateDescription vmstate_tsc_khz = {
     }
 };
 
+#ifdef CONFIG_KVM
+
+static bool vmx_vmcs12_needed(void *opaque)
+{
+    struct kvm_nested_state *nested_state = opaque;
+    return (nested_state->size >
+            offsetof(struct kvm_nested_state, data.vmx[0].vmcs12));
+}
+
+static const VMStateDescription vmstate_vmx_vmcs12 = {
+    .name = "cpu/kvm_nested_state/vmx/vmcs12",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = vmx_vmcs12_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8_ARRAY(data.vmx[0].vmcs12,
+                            struct kvm_nested_state,
+                            KVM_STATE_NESTED_VMX_VMCS_SIZE),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static bool vmx_shadow_vmcs12_needed(void *opaque)
+{
+    struct kvm_nested_state *nested_state = opaque;
+    return (nested_state->size >
+            offsetof(struct kvm_nested_state, data.vmx[0].shadow_vmcs12));
+}
+
+static const VMStateDescription vmstate_vmx_shadow_vmcs12 = {
+    .name = "cpu/kvm_nested_state/vmx/shadow_vmcs12",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = vmx_shadow_vmcs12_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8_ARRAY(data.vmx[0].shadow_vmcs12,
+                            struct kvm_nested_state,
+                            KVM_STATE_NESTED_VMX_VMCS_SIZE),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static bool vmx_nested_state_needed(void *opaque)
+{
+    struct kvm_nested_state *nested_state = opaque;
+
+    return ((nested_state->format == KVM_STATE_NESTED_FORMAT_VMX) &&
+            ((nested_state->hdr.vmx.vmxon_pa != -1ull) ||
+             (nested_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)));
+}
+
+static const VMStateDescription vmstate_vmx_nested_state = {
+    .name = "cpu/kvm_nested_state/vmx",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = vmx_nested_state_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_U64(hdr.vmx.vmxon_pa, struct kvm_nested_state),
+        VMSTATE_U64(hdr.vmx.vmcs12_pa, struct kvm_nested_state),
+        VMSTATE_U16(hdr.vmx.smm.flags, struct kvm_nested_state),
+        VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription*[]) {
+        &vmstate_vmx_vmcs12,
+        &vmstate_vmx_shadow_vmcs12,
+        NULL,
+    }
+};
+
+static bool svm_nested_state_needed(void *opaque)
+{
+    struct kvm_nested_state *nested_state = opaque;
+
+    return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM);
+}
+
+static const VMStateDescription vmstate_svm_nested_state = {
+    .name = "cpu/kvm_nested_state/svm",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = svm_nested_state_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static bool nested_state_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+
+    return (env->nested_state &&
+            (vmx_nested_state_needed(env->nested_state) ||
+             svm_nested_state_needed(env->nested_state)));
+}
+
+static int nested_state_post_load(void *opaque, int version_id)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+    struct kvm_nested_state *nested_state = env->nested_state;
+    int min_nested_state_len = offsetof(struct kvm_nested_state, data);
+    int max_nested_state_len = kvm_max_nested_state_length();
+
+    /*
+     * If our kernel doesn't support setting nested state
+     * and we have received nested state from the migration stream,
+     * we need to fail migration.
+     */
+    if (max_nested_state_len <= 0) {
+        error_report("Received nested state when kernel cannot restore it");
+        return -EINVAL;
+    }
+
+    /*
+     * Verify that the size of the received nested_state struct
+     * at least covers the required header and is not larger
+     * than the max size that our kernel supports.
+     */
+    if (nested_state->size < min_nested_state_len) {
+        error_report("Received nested state size less than min: "
+                     "len=%d, min=%d",
+                     nested_state->size, min_nested_state_len);
+        return -EINVAL;
+    }
+    if (nested_state->size > max_nested_state_len) {
+        error_report("Received unsupported nested state size: "
+                     "nested_state->size=%d, max=%d",
+                     nested_state->size, max_nested_state_len);
+        return -EINVAL;
+    }
+
+    /* Verify format is valid */
+    if ((nested_state->format != KVM_STATE_NESTED_FORMAT_VMX) &&
+        (nested_state->format != KVM_STATE_NESTED_FORMAT_SVM)) {
+        error_report("Received invalid nested state format: %d",
+                     nested_state->format);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_kvm_nested_state = {
+    .name = "cpu/kvm_nested_state",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_U16(flags, struct kvm_nested_state),
+        VMSTATE_U16(format, struct kvm_nested_state),
+        VMSTATE_U32(size, struct kvm_nested_state),
+        VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription*[]) {
+        &vmstate_vmx_nested_state,
+        &vmstate_svm_nested_state,
+        NULL
+    }
+};
+
+static const VMStateDescription vmstate_nested_state = {
+    .name = "cpu/nested_state",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = nested_state_needed,
+    .post_load = nested_state_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_STRUCT_POINTER(env.nested_state, X86CPU,
+                vmstate_kvm_nested_state,
+                struct kvm_nested_state),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+#endif
+
 static bool mcg_ext_ctl_needed(void *opaque)
 {
     X86CPU *cpu = opaque;
@@ -1113,6 +1308,9 @@  VMStateDescription vmstate_x86_cpu = {
 #ifndef TARGET_X86_64
         &vmstate_efer32,
 #endif
+#ifdef CONFIG_KVM
+        &vmstate_nested_state,
+#endif
         NULL
     }
 };