Message ID | 20171218171742.5765-3-vkuznets@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Yikes! This breaks migration to/from older versions of kvm. Will you be submitting another change to handle dynamic conversion between formats? On Mon, Dec 18, 2017 at 9:17 AM, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > From: Ladi Prosek <lprosek@redhat.com> > > Reorders existing fields and adds fields specific to Hyper-V. The layout > now matches Hyper-V TLFS 5.0b 16.11.2 Enlightened VMCS. > > Fields used by KVM but missing from Hyper-V are placed in the second half > of the VMCS page to minimize the chances they will clash with future > enlightened VMCS versions. > > Signed-off-by: Ladi Prosek <lprosek@redhat.com> > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> > --- > [Vitaly]: Update VMCS12_REVISION to some new arbitrary number. > --- > arch/x86/kvm/vmx.c | 321 +++++++++++++++++++++++++++++++---------------------- > 1 file changed, 187 insertions(+), 134 deletions(-) > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 8eba631c4dbd..cd5f29a57880 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -239,159 +239,212 @@ struct __packed vmcs12 { > u32 revision_id; > u32 abort; > > + union { > + u64 hv_vmcs[255]; > + struct { > + u16 host_es_selector; > + u16 host_cs_selector; > + u16 host_ss_selector; > + u16 host_ds_selector; > + u16 host_fs_selector; > + u16 host_gs_selector; > + u16 host_tr_selector; > + > + u64 host_ia32_pat; > + u64 host_ia32_efer; > + > + /* > + * To allow migration of L1 (complete with its L2 > + * guests) between machines of different natural widths > + * (32 or 64 bit), we cannot have unsigned long fields > + * with no explicit size. We use u64 (aliased > + * natural_width) instead. Luckily, x86 is > + * little-endian. > + */ > + natural_width host_cr0; > + natural_width host_cr3; > + natural_width host_cr4; > + > + natural_width host_ia32_sysenter_esp; > + natural_width host_ia32_sysenter_eip; > + natural_width host_rip; > + u32 host_ia32_sysenter_cs; > + > + u32 pin_based_vm_exec_control; > + u32 vm_exit_controls; > + u32 secondary_vm_exec_control; > + > + u64 io_bitmap_a; > + u64 io_bitmap_b; > + u64 msr_bitmap; > + > + u16 guest_es_selector; > + u16 guest_cs_selector; > + u16 guest_ss_selector; > + u16 guest_ds_selector; > + u16 guest_fs_selector; > + u16 guest_gs_selector; > + u16 guest_ldtr_selector; > + u16 guest_tr_selector; > + > + u32 guest_es_limit; > + u32 guest_cs_limit; > + u32 guest_ss_limit; > + u32 guest_ds_limit; > + u32 guest_fs_limit; > + u32 guest_gs_limit; > + u32 guest_ldtr_limit; > + u32 guest_tr_limit; > + u32 guest_gdtr_limit; > + u32 guest_idtr_limit; > + > + u32 guest_es_ar_bytes; > + u32 guest_cs_ar_bytes; > + u32 guest_ss_ar_bytes; > + u32 guest_ds_ar_bytes; > + u32 guest_fs_ar_bytes; > + u32 guest_gs_ar_bytes; > + u32 guest_ldtr_ar_bytes; > + u32 guest_tr_ar_bytes; > + > + natural_width guest_es_base; > + natural_width guest_cs_base; > + natural_width guest_ss_base; > + natural_width guest_ds_base; > + natural_width guest_fs_base; > + natural_width guest_gs_base; > + natural_width guest_ldtr_base; > + natural_width guest_tr_base; > + natural_width guest_gdtr_base; > + natural_width guest_idtr_base; > + > + u64 padding64_1[3]; > + > + u64 vm_exit_msr_store_addr; > + u64 vm_exit_msr_load_addr; > + u64 vm_entry_msr_load_addr; > + > + natural_width cr3_target_value0; > + natural_width cr3_target_value1; > + natural_width cr3_target_value2; > + natural_width cr3_target_value3; > + > + u32 page_fault_error_code_mask; > + u32 page_fault_error_code_match; > + > + u32 cr3_target_count; > + u32 vm_exit_msr_store_count; > + u32 vm_exit_msr_load_count; > + u32 vm_entry_msr_load_count; > + > + u64 tsc_offset; > + u64 virtual_apic_page_addr; > + u64 vmcs_link_pointer; > + > + u64 guest_ia32_debugctl; > + u64 guest_ia32_pat; > + u64 guest_ia32_efer; > + > + u64 guest_pdptr0; > + u64 guest_pdptr1; > + u64 guest_pdptr2; > + u64 guest_pdptr3; > + > + natural_width guest_pending_dbg_exceptions; > + natural_width guest_sysenter_esp; > + natural_width guest_sysenter_eip; > + > + u32 guest_activity_state; > + u32 guest_sysenter_cs; > + > + natural_width cr0_guest_host_mask; > + natural_width cr4_guest_host_mask; > + natural_width cr0_read_shadow; > + natural_width cr4_read_shadow; > + natural_width guest_cr0; > + natural_width guest_cr3; > + natural_width guest_cr4; > + natural_width guest_dr7; > + > + natural_width host_fs_base; > + natural_width host_gs_base; > + natural_width host_tr_base; > + natural_width host_gdtr_base; > + natural_width host_idtr_base; > + natural_width host_rsp; > + > + u64 ept_pointer; > + > + u16 virtual_processor_id; > + u16 padding16[3]; > + > + u64 padding64_2[5]; > + u64 guest_physical_address; > + > + u32 vm_instruction_error; > + u32 vm_exit_reason; > + u32 vm_exit_intr_info; > + u32 vm_exit_intr_error_code; > + u32 idt_vectoring_info_field; > + u32 idt_vectoring_error_code; > + u32 vm_exit_instruction_len; > + u32 vmx_instruction_info; > + > + natural_width exit_qualification; > + natural_width padding64_3[4]; > + > + natural_width guest_linear_address; > + natural_width guest_rsp; > + natural_width guest_rflags; > + > + u32 guest_interruptibility_info; > + u32 cpu_based_vm_exec_control; > + u32 exception_bitmap; > + u32 vm_entry_controls; > + u32 vm_entry_intr_info_field; > + u32 vm_entry_exception_error_code; > + u32 vm_entry_instruction_len; > + u32 tpr_threshold; > + > + natural_width guest_rip; > + > + u32 hv_clean_fields; > + u32 hv_padding_32; > + u32 hv_synthetic_controls; > + u32 hv_enlightenments_control; > + u32 hv_vp_id; > + > + u64 hv_vm_id; > + u64 partition_assist_page; > + u64 padding64_4[4]; > + u64 guest_bndcfgs; > + u64 padding64_5[7]; > + u64 xss_exit_bitmap; > + u64 padding64_6[7]; > + }; > + }; > + > + /* Synthetic and KVM-specific fields: */ > u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ > u32 padding[7]; /* room for future expansion */ > > - u64 io_bitmap_a; > - u64 io_bitmap_b; > - u64 msr_bitmap; > - u64 vm_exit_msr_store_addr; > - u64 vm_exit_msr_load_addr; > - u64 vm_entry_msr_load_addr; > - u64 tsc_offset; > - u64 virtual_apic_page_addr; > u64 apic_access_addr; > u64 posted_intr_desc_addr; > u64 vm_function_control; > - u64 ept_pointer; > u64 eoi_exit_bitmap0; > u64 eoi_exit_bitmap1; > u64 eoi_exit_bitmap2; > u64 eoi_exit_bitmap3; > u64 eptp_list_address; > - u64 xss_exit_bitmap; > - u64 guest_physical_address; > - u64 vmcs_link_pointer; > u64 pml_address; > - u64 guest_ia32_debugctl; > - u64 guest_ia32_pat; > - u64 guest_ia32_efer; > u64 guest_ia32_perf_global_ctrl; > - u64 guest_pdptr0; > - u64 guest_pdptr1; > - u64 guest_pdptr2; > - u64 guest_pdptr3; > - u64 guest_bndcfgs; > - u64 host_ia32_pat; > - u64 host_ia32_efer; > u64 host_ia32_perf_global_ctrl; > u64 padding64[8]; /* room for future expansion */ > - /* > - * To allow migration of L1 (complete with its L2 guests) between > - * machines of different natural widths (32 or 64 bit), we cannot have > - * unsigned long fields with no explict size. We use u64 (aliased > - * natural_width) instead. Luckily, x86 is little-endian. > - */ > - natural_width cr0_guest_host_mask; > - natural_width cr4_guest_host_mask; > - natural_width cr0_read_shadow; > - natural_width cr4_read_shadow; > - natural_width cr3_target_value0; > - natural_width cr3_target_value1; > - natural_width cr3_target_value2; > - natural_width cr3_target_value3; > - natural_width exit_qualification; > - natural_width guest_linear_address; > - natural_width guest_cr0; > - natural_width guest_cr3; > - natural_width guest_cr4; > - natural_width guest_es_base; > - natural_width guest_cs_base; > - natural_width guest_ss_base; > - natural_width guest_ds_base; > - natural_width guest_fs_base; > - natural_width guest_gs_base; > - natural_width guest_ldtr_base; > - natural_width guest_tr_base; > - natural_width guest_gdtr_base; > - natural_width guest_idtr_base; > - natural_width guest_dr7; > - natural_width guest_rsp; > - natural_width guest_rip; > - natural_width guest_rflags; > - natural_width guest_pending_dbg_exceptions; > - natural_width guest_sysenter_esp; > - natural_width guest_sysenter_eip; > - natural_width host_cr0; > - natural_width host_cr3; > - natural_width host_cr4; > - natural_width host_fs_base; > - natural_width host_gs_base; > - natural_width host_tr_base; > - natural_width host_gdtr_base; > - natural_width host_idtr_base; > - natural_width host_ia32_sysenter_esp; > - natural_width host_ia32_sysenter_eip; > - natural_width host_rsp; > - natural_width host_rip; > - natural_width paddingl[8]; /* room for future expansion */ > - u32 pin_based_vm_exec_control; > - u32 cpu_based_vm_exec_control; > - u32 exception_bitmap; > - u32 page_fault_error_code_mask; > - u32 page_fault_error_code_match; > - u32 cr3_target_count; > - u32 vm_exit_controls; > - u32 vm_exit_msr_store_count; > - u32 vm_exit_msr_load_count; > - u32 vm_entry_controls; > - u32 vm_entry_msr_load_count; > - u32 vm_entry_intr_info_field; > - u32 vm_entry_exception_error_code; > - u32 vm_entry_instruction_len; > - u32 tpr_threshold; > - u32 secondary_vm_exec_control; > - u32 vm_instruction_error; > - u32 vm_exit_reason; > - u32 vm_exit_intr_info; > - u32 vm_exit_intr_error_code; > - u32 idt_vectoring_info_field; > - u32 idt_vectoring_error_code; > - u32 vm_exit_instruction_len; > - u32 vmx_instruction_info; > - u32 guest_es_limit; > - u32 guest_cs_limit; > - u32 guest_ss_limit; > - u32 guest_ds_limit; > - u32 guest_fs_limit; > - u32 guest_gs_limit; > - u32 guest_ldtr_limit; > - u32 guest_tr_limit; > - u32 guest_gdtr_limit; > - u32 guest_idtr_limit; > - u32 guest_es_ar_bytes; > - u32 guest_cs_ar_bytes; > - u32 guest_ss_ar_bytes; > - u32 guest_ds_ar_bytes; > - u32 guest_fs_ar_bytes; > - u32 guest_gs_ar_bytes; > - u32 guest_ldtr_ar_bytes; > - u32 guest_tr_ar_bytes; > - u32 guest_interruptibility_info; > - u32 guest_activity_state; > - u32 guest_sysenter_cs; > - u32 host_ia32_sysenter_cs; > u32 vmx_preemption_timer_value; > u32 padding32[7]; /* room for future expansion */ > - u16 virtual_processor_id; > u16 posted_intr_nv; > - u16 guest_es_selector; > - u16 guest_cs_selector; > - u16 guest_ss_selector; > - u16 guest_ds_selector; > - u16 guest_fs_selector; > - u16 guest_gs_selector; > - u16 guest_ldtr_selector; > - u16 guest_tr_selector; > u16 guest_intr_status; > u16 guest_pml_index; > - u16 host_es_selector; > - u16 host_cs_selector; > - u16 host_ss_selector; > - u16 host_ds_selector; > - u16 host_fs_selector; > - u16 host_gs_selector; > - u16 host_tr_selector; > }; > > /* > @@ -399,7 +452,7 @@ struct __packed vmcs12 { > * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and > * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. > */ > -#define VMCS12_REVISION 0x11e57ed0 > +#define VMCS12_REVISION 0x11e586b1 > > /* > * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region > -- > 2.14.3 >
At this point in time, I don't think you can just blithely change the virtual VMCS layout and revision number. Existing VMs using the old layout and revision number must continue to work on versions of kvm past this point. You could tie the layout and revision number changes to KVM_CAP_HYPERV_ENLIGHTENED_VMCS if you like, but kvm must be able to continue to service VMs using the previous layout and revision number in perpetuity. On Mon, Dec 18, 2017 at 12:23 PM, Jim Mattson <jmattson@google.com> wrote: > Yikes! This breaks migration to/from older versions of kvm. Will you > be submitting another change to handle dynamic conversion between > formats? > > On Mon, Dec 18, 2017 at 9:17 AM, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: >> From: Ladi Prosek <lprosek@redhat.com> >> >> Reorders existing fields and adds fields specific to Hyper-V. The layout >> now matches Hyper-V TLFS 5.0b 16.11.2 Enlightened VMCS. >> >> Fields used by KVM but missing from Hyper-V are placed in the second half >> of the VMCS page to minimize the chances they will clash with future >> enlightened VMCS versions. >> >> Signed-off-by: Ladi Prosek <lprosek@redhat.com> >> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> >> --- >> [Vitaly]: Update VMCS12_REVISION to some new arbitrary number. >> --- >> arch/x86/kvm/vmx.c | 321 +++++++++++++++++++++++++++++++---------------------- >> 1 file changed, 187 insertions(+), 134 deletions(-) >> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index 8eba631c4dbd..cd5f29a57880 100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -239,159 +239,212 @@ struct __packed vmcs12 { >> u32 revision_id; >> u32 abort; >> >> + union { >> + u64 hv_vmcs[255]; >> + struct { >> + u16 host_es_selector; >> + u16 host_cs_selector; >> + u16 host_ss_selector; >> + u16 host_ds_selector; >> + u16 host_fs_selector; >> + u16 host_gs_selector; >> + u16 host_tr_selector; >> + >> + u64 host_ia32_pat; >> + u64 host_ia32_efer; >> + >> + /* >> + * To allow migration of L1 (complete with its L2 >> + * guests) between machines of different natural widths >> + * (32 or 64 bit), we cannot have unsigned long fields >> + * with no explicit size. We use u64 (aliased >> + * natural_width) instead. Luckily, x86 is >> + * little-endian. >> + */ >> + natural_width host_cr0; >> + natural_width host_cr3; >> + natural_width host_cr4; >> + >> + natural_width host_ia32_sysenter_esp; >> + natural_width host_ia32_sysenter_eip; >> + natural_width host_rip; >> + u32 host_ia32_sysenter_cs; >> + >> + u32 pin_based_vm_exec_control; >> + u32 vm_exit_controls; >> + u32 secondary_vm_exec_control; >> + >> + u64 io_bitmap_a; >> + u64 io_bitmap_b; >> + u64 msr_bitmap; >> + >> + u16 guest_es_selector; >> + u16 guest_cs_selector; >> + u16 guest_ss_selector; >> + u16 guest_ds_selector; >> + u16 guest_fs_selector; >> + u16 guest_gs_selector; >> + u16 guest_ldtr_selector; >> + u16 guest_tr_selector; >> + >> + u32 guest_es_limit; >> + u32 guest_cs_limit; >> + u32 guest_ss_limit; >> + u32 guest_ds_limit; >> + u32 guest_fs_limit; >> + u32 guest_gs_limit; >> + u32 guest_ldtr_limit; >> + u32 guest_tr_limit; >> + u32 guest_gdtr_limit; >> + u32 guest_idtr_limit; >> + >> + u32 guest_es_ar_bytes; >> + u32 guest_cs_ar_bytes; >> + u32 guest_ss_ar_bytes; >> + u32 guest_ds_ar_bytes; >> + u32 guest_fs_ar_bytes; >> + u32 guest_gs_ar_bytes; >> + u32 guest_ldtr_ar_bytes; >> + u32 guest_tr_ar_bytes; >> + >> + natural_width guest_es_base; >> + natural_width guest_cs_base; >> + natural_width guest_ss_base; >> + natural_width guest_ds_base; >> + natural_width guest_fs_base; >> + natural_width guest_gs_base; >> + natural_width guest_ldtr_base; >> + natural_width guest_tr_base; >> + natural_width guest_gdtr_base; >> + natural_width guest_idtr_base; >> + >> + u64 padding64_1[3]; >> + >> + u64 vm_exit_msr_store_addr; >> + u64 vm_exit_msr_load_addr; >> + u64 vm_entry_msr_load_addr; >> + >> + natural_width cr3_target_value0; >> + natural_width cr3_target_value1; >> + natural_width cr3_target_value2; >> + natural_width cr3_target_value3; >> + >> + u32 page_fault_error_code_mask; >> + u32 page_fault_error_code_match; >> + >> + u32 cr3_target_count; >> + u32 vm_exit_msr_store_count; >> + u32 vm_exit_msr_load_count; >> + u32 vm_entry_msr_load_count; >> + >> + u64 tsc_offset; >> + u64 virtual_apic_page_addr; >> + u64 vmcs_link_pointer; >> + >> + u64 guest_ia32_debugctl; >> + u64 guest_ia32_pat; >> + u64 guest_ia32_efer; >> + >> + u64 guest_pdptr0; >> + u64 guest_pdptr1; >> + u64 guest_pdptr2; >> + u64 guest_pdptr3; >> + >> + natural_width guest_pending_dbg_exceptions; >> + natural_width guest_sysenter_esp; >> + natural_width guest_sysenter_eip; >> + >> + u32 guest_activity_state; >> + u32 guest_sysenter_cs; >> + >> + natural_width cr0_guest_host_mask; >> + natural_width cr4_guest_host_mask; >> + natural_width cr0_read_shadow; >> + natural_width cr4_read_shadow; >> + natural_width guest_cr0; >> + natural_width guest_cr3; >> + natural_width guest_cr4; >> + natural_width guest_dr7; >> + >> + natural_width host_fs_base; >> + natural_width host_gs_base; >> + natural_width host_tr_base; >> + natural_width host_gdtr_base; >> + natural_width host_idtr_base; >> + natural_width host_rsp; >> + >> + u64 ept_pointer; >> + >> + u16 virtual_processor_id; >> + u16 padding16[3]; >> + >> + u64 padding64_2[5]; >> + u64 guest_physical_address; >> + >> + u32 vm_instruction_error; >> + u32 vm_exit_reason; >> + u32 vm_exit_intr_info; >> + u32 vm_exit_intr_error_code; >> + u32 idt_vectoring_info_field; >> + u32 idt_vectoring_error_code; >> + u32 vm_exit_instruction_len; >> + u32 vmx_instruction_info; >> + >> + natural_width exit_qualification; >> + natural_width padding64_3[4]; >> + >> + natural_width guest_linear_address; >> + natural_width guest_rsp; >> + natural_width guest_rflags; >> + >> + u32 guest_interruptibility_info; >> + u32 cpu_based_vm_exec_control; >> + u32 exception_bitmap; >> + u32 vm_entry_controls; >> + u32 vm_entry_intr_info_field; >> + u32 vm_entry_exception_error_code; >> + u32 vm_entry_instruction_len; >> + u32 tpr_threshold; >> + >> + natural_width guest_rip; >> + >> + u32 hv_clean_fields; >> + u32 hv_padding_32; >> + u32 hv_synthetic_controls; >> + u32 hv_enlightenments_control; >> + u32 hv_vp_id; >> + >> + u64 hv_vm_id; >> + u64 partition_assist_page; >> + u64 padding64_4[4]; >> + u64 guest_bndcfgs; >> + u64 padding64_5[7]; >> + u64 xss_exit_bitmap; >> + u64 padding64_6[7]; >> + }; >> + }; >> + >> + /* Synthetic and KVM-specific fields: */ >> u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ >> u32 padding[7]; /* room for future expansion */ >> >> - u64 io_bitmap_a; >> - u64 io_bitmap_b; >> - u64 msr_bitmap; >> - u64 vm_exit_msr_store_addr; >> - u64 vm_exit_msr_load_addr; >> - u64 vm_entry_msr_load_addr; >> - u64 tsc_offset; >> - u64 virtual_apic_page_addr; >> u64 apic_access_addr; >> u64 posted_intr_desc_addr; >> u64 vm_function_control; >> - u64 ept_pointer; >> u64 eoi_exit_bitmap0; >> u64 eoi_exit_bitmap1; >> u64 eoi_exit_bitmap2; >> u64 eoi_exit_bitmap3; >> u64 eptp_list_address; >> - u64 xss_exit_bitmap; >> - u64 guest_physical_address; >> - u64 vmcs_link_pointer; >> u64 pml_address; >> - u64 guest_ia32_debugctl; >> - u64 guest_ia32_pat; >> - u64 guest_ia32_efer; >> u64 guest_ia32_perf_global_ctrl; >> - u64 guest_pdptr0; >> - u64 guest_pdptr1; >> - u64 guest_pdptr2; >> - u64 guest_pdptr3; >> - u64 guest_bndcfgs; >> - u64 host_ia32_pat; >> - u64 host_ia32_efer; >> u64 host_ia32_perf_global_ctrl; >> u64 padding64[8]; /* room for future expansion */ >> - /* >> - * To allow migration of L1 (complete with its L2 guests) between >> - * machines of different natural widths (32 or 64 bit), we cannot have >> - * unsigned long fields with no explict size. We use u64 (aliased >> - * natural_width) instead. Luckily, x86 is little-endian. >> - */ >> - natural_width cr0_guest_host_mask; >> - natural_width cr4_guest_host_mask; >> - natural_width cr0_read_shadow; >> - natural_width cr4_read_shadow; >> - natural_width cr3_target_value0; >> - natural_width cr3_target_value1; >> - natural_width cr3_target_value2; >> - natural_width cr3_target_value3; >> - natural_width exit_qualification; >> - natural_width guest_linear_address; >> - natural_width guest_cr0; >> - natural_width guest_cr3; >> - natural_width guest_cr4; >> - natural_width guest_es_base; >> - natural_width guest_cs_base; >> - natural_width guest_ss_base; >> - natural_width guest_ds_base; >> - natural_width guest_fs_base; >> - natural_width guest_gs_base; >> - natural_width guest_ldtr_base; >> - natural_width guest_tr_base; >> - natural_width guest_gdtr_base; >> - natural_width guest_idtr_base; >> - natural_width guest_dr7; >> - natural_width guest_rsp; >> - natural_width guest_rip; >> - natural_width guest_rflags; >> - natural_width guest_pending_dbg_exceptions; >> - natural_width guest_sysenter_esp; >> - natural_width guest_sysenter_eip; >> - natural_width host_cr0; >> - natural_width host_cr3; >> - natural_width host_cr4; >> - natural_width host_fs_base; >> - natural_width host_gs_base; >> - natural_width host_tr_base; >> - natural_width host_gdtr_base; >> - natural_width host_idtr_base; >> - natural_width host_ia32_sysenter_esp; >> - natural_width host_ia32_sysenter_eip; >> - natural_width host_rsp; >> - natural_width host_rip; >> - natural_width paddingl[8]; /* room for future expansion */ >> - u32 pin_based_vm_exec_control; >> - u32 cpu_based_vm_exec_control; >> - u32 exception_bitmap; >> - u32 page_fault_error_code_mask; >> - u32 page_fault_error_code_match; >> - u32 cr3_target_count; >> - u32 vm_exit_controls; >> - u32 vm_exit_msr_store_count; >> - u32 vm_exit_msr_load_count; >> - u32 vm_entry_controls; >> - u32 vm_entry_msr_load_count; >> - u32 vm_entry_intr_info_field; >> - u32 vm_entry_exception_error_code; >> - u32 vm_entry_instruction_len; >> - u32 tpr_threshold; >> - u32 secondary_vm_exec_control; >> - u32 vm_instruction_error; >> - u32 vm_exit_reason; >> - u32 vm_exit_intr_info; >> - u32 vm_exit_intr_error_code; >> - u32 idt_vectoring_info_field; >> - u32 idt_vectoring_error_code; >> - u32 vm_exit_instruction_len; >> - u32 vmx_instruction_info; >> - u32 guest_es_limit; >> - u32 guest_cs_limit; >> - u32 guest_ss_limit; >> - u32 guest_ds_limit; >> - u32 guest_fs_limit; >> - u32 guest_gs_limit; >> - u32 guest_ldtr_limit; >> - u32 guest_tr_limit; >> - u32 guest_gdtr_limit; >> - u32 guest_idtr_limit; >> - u32 guest_es_ar_bytes; >> - u32 guest_cs_ar_bytes; >> - u32 guest_ss_ar_bytes; >> - u32 guest_ds_ar_bytes; >> - u32 guest_fs_ar_bytes; >> - u32 guest_gs_ar_bytes; >> - u32 guest_ldtr_ar_bytes; >> - u32 guest_tr_ar_bytes; >> - u32 guest_interruptibility_info; >> - u32 guest_activity_state; >> - u32 guest_sysenter_cs; >> - u32 host_ia32_sysenter_cs; >> u32 vmx_preemption_timer_value; >> u32 padding32[7]; /* room for future expansion */ >> - u16 virtual_processor_id; >> u16 posted_intr_nv; >> - u16 guest_es_selector; >> - u16 guest_cs_selector; >> - u16 guest_ss_selector; >> - u16 guest_ds_selector; >> - u16 guest_fs_selector; >> - u16 guest_gs_selector; >> - u16 guest_ldtr_selector; >> - u16 guest_tr_selector; >> u16 guest_intr_status; >> u16 guest_pml_index; >> - u16 host_es_selector; >> - u16 host_cs_selector; >> - u16 host_ss_selector; >> - u16 host_ds_selector; >> - u16 host_fs_selector; >> - u16 host_gs_selector; >> - u16 host_tr_selector; >> }; >> >> /* >> @@ -399,7 +452,7 @@ struct __packed vmcs12 { >> * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and >> * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. >> */ >> -#define VMCS12_REVISION 0x11e57ed0 >> +#define VMCS12_REVISION 0x11e586b1 >> >> /* >> * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region >> -- >> 2.14.3 >>
Jim Mattson <jmattson@google.com> writes: > At this point in time, I don't think you can just blithely change the > virtual VMCS layout and revision number. Existing VMs using the old > layout and revision number must continue to work on versions of kvm > past this point. You could tie the layout and revision number changes > to KVM_CAP_HYPERV_ENLIGHTENED_VMCS if you like, but kvm must be able > to continue to service VMs using the previous layout and revision > number in perpetuity. > I see what you mean. In case we need to keep migration of nested workloads working between KVMs of different versions we can't (ever) touch vmcs12. The way to go in this case, I think, is to create a completely separate enlightened_vmcs12 struct and use it when appropriate. We can't possibly support migrating workloads which use enlightened VMCS to an old KVM which doesn't support it. P.S. "If there are changes in this struct, VMCS12_REVISION must be changed." comment needs to be replaced with "Don't even think about changing this" :-)
On 19/12/2017 13:25, Vitaly Kuznetsov wrote: > >> At this point in time, I don't think you can just blithely change the >> virtual VMCS layout and revision number. Existing VMs using the old >> layout and revision number must continue to work on versions of kvm >> past this point. You could tie the layout and revision number changes >> to KVM_CAP_HYPERV_ENLIGHTENED_VMCS if you like, but kvm must be able >> to continue to service VMs using the previous layout and revision >> number in perpetuity. >> > I see what you mean. In case we need to keep migration of nested > workloads working between KVMs of different versions we can't (ever) > touch vmcs12. Actually we can, for two reasons. First, the active VMCS is stored in host RAM (not in guest RAM). This means there are clear points where to do the translation, namely vmptrld and the (not yet upstream) ioctl to set VMX state. Therefore you only need to keep an (offset, type) map from old to new layout map; at those two points if you detect an old VMCS12_REVISION you copy the fields one by one instead of doing a memcpy. The next vmclear or vmptrld or get-VMX-state ioctl will automatically update to the new VMCS12_REVISION. Of course, this is a one-way street unless you also add support for writing old VMCS12_REVISIONs. But, second, VMX state migration is not upstream yet, so nested hypervisors are currently not migratable: the active VMCS12 state will not be migrated at all! So in upstream KVM we wouldn't even need to upgrade the VMCS12_REVISION to make changes to vmcs12. That said... > The way to go in this case, I think, is to create a completely separate > enlightened_vmcs12 struct and use it when appropriate. We can't possibly > support migrating workloads which use enlightened VMCS to an old KVM > which doesn't support it. ... this is probably a good idea as well. Paolo
On Tue, Dec 19, 2017 at 4:37 AM, Paolo Bonzini <pbonzini@redhat.com> wrote: > On 19/12/2017 13:25, Vitaly Kuznetsov wrote: >> >>> At this point in time, I don't think you can just blithely change the >>> virtual VMCS layout and revision number. Existing VMs using the old >>> layout and revision number must continue to work on versions of kvm >>> past this point. You could tie the layout and revision number changes >>> to KVM_CAP_HYPERV_ENLIGHTENED_VMCS if you like, but kvm must be able >>> to continue to service VMs using the previous layout and revision >>> number in perpetuity. >>> >> I see what you mean. In case we need to keep migration of nested >> workloads working between KVMs of different versions we can't (ever) >> touch vmcs12. > > Actually we can, for two reasons. > > First, the active VMCS is stored in host RAM (not in guest RAM). This > means there are clear points where to do the translation, namely vmptrld > and the (not yet upstream) ioctl to set VMX state. > > Therefore you only need to keep an (offset, type) map from old to new > layout map; at those two points if you detect an old VMCS12_REVISION you > copy the fields one by one instead of doing a memcpy. The next vmclear > or vmptrld or get-VMX-state ioctl will automatically update to the new > VMCS12_REVISION. Of course, this is a one-way street unless you also > add support for writing old VMCS12_REVISIONs. I'm not sure that's really the right way to go, since any guest that has already read the IA32_VMX_BASIC MSR has a right to expect the VMCS revision to remain unchanged. Also, this breaks migration back to older versions of kvm, even for VMs that are not making use of the enlightened VMCS. It would be nice to be able to maintain the ability for an older VM to run on a newer kvm version without picking up any taint that would make it impossible to migrate it back to the older kvm version where it started. > But, second, VMX state migration is not upstream yet, so nested > hypervisors are currently not migratable: the active VMCS12 state will > not be migrated at all! So in upstream KVM we wouldn't even need to > upgrade the VMCS12_REVISION to make changes to vmcs12. Mea culpa. Perhaps this is the motivation I needed to prioritize incorporating the community feedback on that change set. > That said... > >> The way to go in this case, I think, is to create a completely separate >> enlightened_vmcs12 struct and use it when appropriate. We can't possibly >> support migrating workloads which use enlightened VMCS to an old KVM >> which doesn't support it. > > ... this is probably a good idea as well. > > Paolo
You can change the default VMCS12_REVISION and associated layout, as long as support is maintained for the old layout and userspace has the ability (e.g. by setting the IA32_VMX_BASIC MSR) to specify that a VM needs to use the old layout. On Tue, Dec 19, 2017 at 4:25 AM, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > Jim Mattson <jmattson@google.com> writes: > >> At this point in time, I don't think you can just blithely change the >> virtual VMCS layout and revision number. Existing VMs using the old >> layout and revision number must continue to work on versions of kvm >> past this point. You could tie the layout and revision number changes >> to KVM_CAP_HYPERV_ENLIGHTENED_VMCS if you like, but kvm must be able >> to continue to service VMs using the previous layout and revision >> number in perpetuity. >> > > I see what you mean. In case we need to keep migration of nested > workloads working between KVMs of different versions we can't (ever) > touch vmcs12. > > The way to go in this case, I think, is to create a completely separate > enlightened_vmcs12 struct and use it when appropriate. We can't possibly > support migrating workloads which use enlightened VMCS to an old KVM > which doesn't support it. > > P.S. "If there are changes in this struct, VMCS12_REVISION must be > changed." comment needs to be replaced with "Don't even think about > changing this" :-) > > -- > Vitaly
On 19/12/2017 18:40, Jim Mattson wrote: > I'm not sure that's really the right way to go, since any guest that > has already read the IA32_VMX_BASIC MSR has a right to expect the VMCS > revision to remain unchanged. Hmm, not just that, "the VMCS revision identifier is never written by the processor" according to the SDM. Maybe the code that accesses the vmcs12 can be placed in a .h file and included more than once in vmx.c. Paolo
Paolo Bonzini <pbonzini@redhat.com> writes: > On 19/12/2017 13:25, Vitaly Kuznetsov wrote: >> >>> At this point in time, I don't think you can just blithely change the >>> virtual VMCS layout and revision number. Existing VMs using the old >>> layout and revision number must continue to work on versions of kvm >>> past this point. You could tie the layout and revision number changes >>> to KVM_CAP_HYPERV_ENLIGHTENED_VMCS if you like, but kvm must be able >>> to continue to service VMs using the previous layout and revision >>> number in perpetuity. >>> >> I see what you mean. In case we need to keep migration of nested >> workloads working between KVMs of different versions we can't (ever) >> touch vmcs12. > > Actually we can, for two reasons. > > First, the active VMCS is stored in host RAM (not in guest RAM). This > means there are clear points where to do the translation, namely vmptrld > and the (not yet upstream) ioctl to set VMX state. > > Therefore you only need to keep an (offset, type) map from old to new > layout map; at those two points if you detect an old VMCS12_REVISION you > copy the fields one by one instead of doing a memcpy. The next vmclear > or vmptrld or get-VMX-state ioctl will automatically update to the new > VMCS12_REVISION. Of course, this is a one-way street unless you also > add support for writing old VMCS12_REVISIONs. > > But, second, VMX state migration is not upstream yet, so nested > hypervisors are currently not migratable: the active VMCS12 state will > not be migrated at all! So in upstream KVM we wouldn't even need to > upgrade the VMCS12_REVISION to make changes to vmcs12. > > That said... > >> The way to go in this case, I think, is to create a completely separate >> enlightened_vmcs12 struct and use it when appropriate. We can't possibly >> support migrating workloads which use enlightened VMCS to an old KVM >> which doesn't support it. > > ... this is probably a good idea as well. > One other thing I was thinking about is the shared definition of enlightened vmcs which we'll use for both KVM-on-Hyper-V and Hyper-V on KVM and for that purpose I'd like it to be placed outside of struct vmcs12. We can, of course, embed it at the beginning of vmcs12. Thinking long term (and having in mind that Microsoft will be updating enlightened VMCS on its own schedule) -- what would be the preferred way to go? It seems that personally I'm leaning towards untangling and keeping it separate from vmcs12 but I can't really find a convincing argument...
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eba631c4dbd..cd5f29a57880 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -239,159 +239,212 @@ struct __packed vmcs12 { u32 revision_id; u32 abort; + union { + u64 hv_vmcs[255]; + struct { + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + + u64 host_ia32_pat; + u64 host_ia32_efer; + + /* + * To allow migration of L1 (complete with its L2 + * guests) between machines of different natural widths + * (32 or 64 bit), we cannot have unsigned long fields + * with no explicit size. We use u64 (aliased + * natural_width) instead. Luckily, x86 is + * little-endian. + */ + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rip; + u32 host_ia32_sysenter_cs; + + u32 pin_based_vm_exec_control; + u32 vm_exit_controls; + u32 secondary_vm_exec_control; + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + + u64 padding64_1[3]; + + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + + u32 cr3_target_count; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_msr_load_count; + + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 vmcs_link_pointer; + + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + + u32 guest_activity_state; + u32 guest_sysenter_cs; + + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_dr7; + + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_rsp; + + u64 ept_pointer; + + u16 virtual_processor_id; + u16 padding16[3]; + + u64 padding64_2[5]; + u64 guest_physical_address; + + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + + natural_width exit_qualification; + natural_width padding64_3[4]; + + natural_width guest_linear_address; + natural_width guest_rsp; + natural_width guest_rflags; + + u32 guest_interruptibility_info; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 vm_entry_controls; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + + natural_width guest_rip; + + u32 hv_clean_fields; + u32 hv_padding_32; + u32 hv_synthetic_controls; + u32 hv_enlightenments_control; + u32 hv_vp_id; + + u64 hv_vm_id; + u64 partition_assist_page; + u64 padding64_4[4]; + u64 guest_bndcfgs; + u64 padding64_5[7]; + u64 xss_exit_bitmap; + u64 padding64_6[7]; + }; + }; + + /* Synthetic and KVM-specific fields: */ u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ u32 padding[7]; /* room for future expansion */ - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; u64 apic_access_addr; u64 posted_intr_desc_addr; u64 vm_function_control; - u64 ept_pointer; u64 eoi_exit_bitmap0; u64 eoi_exit_bitmap1; u64 eoi_exit_bitmap2; u64 eoi_exit_bitmap3; u64 eptp_list_address; - u64 xss_exit_bitmap; - u64 guest_physical_address; - u64 vmcs_link_pointer; u64 pml_address; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; u64 guest_ia32_perf_global_ctrl; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 guest_bndcfgs; - u64 host_ia32_pat; - u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; u64 padding64[8]; /* room for future expansion */ - /* - * To allow migration of L1 (complete with its L2 guests) between - * machines of different natural widths (32 or 64 bit), we cannot have - * unsigned long fields with no explict size. We use u64 (aliased - * natural_width) instead. Luckily, x86 is little-endian. - */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; u32 vmx_preemption_timer_value; u32 padding32[7]; /* room for future expansion */ - u16 virtual_processor_id; u16 posted_intr_nv; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; u16 guest_intr_status; u16 guest_pml_index; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; }; /* @@ -399,7 +452,7 @@ struct __packed vmcs12 { * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. */ -#define VMCS12_REVISION 0x11e57ed0 +#define VMCS12_REVISION 0x11e586b1 /* * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region