Message ID | 1457124368-2025-6-git-send-email-Suravee.Suthikulpanit@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 04/03/2016 21:46, Suravee Suthikulpanit wrote: > @@ -162,6 +170,37 @@ struct vcpu_svm { > > /* cached guest cpuid flags for faster access */ > bool nrips_enabled : 1; > + > + struct page *avic_bk_page; > + void *in_kernel_lapic_regs; > +}; > + > +struct __attribute__ ((__packed__)) > +svm_avic_log_ait_entry { > + u32 guest_phy_apic_id : 8, > + res : 23, > + valid : 1; > +}; > + > +struct __attribute__ ((__packed__)) > +svm_avic_phy_ait_entry { > + u64 host_phy_apic_id : 8, > + res1 : 4, > + bk_pg_ptr : 40, > + res2 : 10, > + is_running : 1, > + valid : 1; > +}; Please don't use bitfields. > +/* Note: This structure is per VM */ > +struct svm_vm_data { > + atomic_t count; > + u32 ldr_mode; > + u32 avic_max_vcpu_id; > + u32 avic_tag; > + > + struct page *avic_log_ait_page; > + struct page *avic_phy_ait_page; You can put these directly in kvm_arch. Do not use abbreviations: struct page *avic_logical_apic_id_table_page; struct page *avic_physical_apic_id_table_page; > @@ -1000,6 +1057,41 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) > mark_dirty(svm->vmcb, VMCB_INTERCEPTS); > } > > +static void avic_init_vmcb(struct vcpu_svm *svm) > +{ > + void *vapic_bkpg; > + struct vmcb *vmcb = svm->vmcb; > + struct svm_vm_data *vm_data = svm->vcpu.kvm->arch.arch_data; > + phys_addr_t bpa = PFN_PHYS(page_to_pfn(svm->avic_bk_page)); > + phys_addr_t lpa = PFN_PHYS(page_to_pfn(vm_data->avic_log_ait_page)); > + phys_addr_t ppa = PFN_PHYS(page_to_pfn(vm_data->avic_phy_ait_page)); Please use page_to_phys. > + if (!vmcb) > + return; > + > + pr_debug("%s: bpa=%#llx, lpa=%#llx, ppa=%#llx\n", > + __func__, (unsigned long long)bpa, > + (unsigned long long)lpa, (unsigned long long)ppa); No pr_debug. > + > + /* > + * Here we copy the value from the emulated LAPIC register > + * to the vAPIC backign page, and then flip regs frame. 
> + */ > + if (!svm->in_kernel_lapic_regs) > + svm->in_kernel_lapic_regs = svm->vcpu.arch.apic->regs; > + > + vapic_bkpg = pfn_to_kaddr(page_to_pfn(svm->avic_bk_page)); Please use kmap instead. > + memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE); > + svm->vcpu.arch.apic->regs = vapic_bkpg; Can you explain the flipping logic, and why you cannot just use the existing apic.regs? > + vmcb->control.avic_bk_page = bpa & AVIC_HPA_MASK; No abbreviations ("avic_backing_page"). > + vmcb->control.avic_log_apic_id = lpa & AVIC_HPA_MASK; > + vmcb->control.avic_phy_apic_id = ppa & AVIC_HPA_MASK; > + vmcb->control.avic_phy_apic_id |= AVIC_PHY_APIC_ID_MAX; > + vmcb->control.avic_enable = 1; > +} > + > static void init_vmcb(struct vcpu_svm *svm) > { > struct vmcb_control_area *control = &svm->vmcb->control; > @@ -1113,6 +1205,293 @@ static void init_vmcb(struct vcpu_svm *svm) > mark_all_dirty(svm->vmcb); > > enable_gif(svm); > + > + if (avic) > + avic_init_vmcb(svm); > +} > + > +static struct svm_avic_phy_ait_entry * > +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index) > +{ > + struct svm_avic_phy_ait_entry *avic_phy_ait; > + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; > + > + if (!vm_data) > + return NULL; > + > + /* Note: APIC ID = 0xff is used for broadcast. > + * APIC ID > 0xff is reserved. 
> + */ > + if (index >= 0xff) > + return NULL; > + > + avic_phy_ait = page_address(vm_data->avic_phy_ait_page); > + > + return &avic_phy_ait[index]; > +} > + > +struct svm_avic_log_ait_entry * > +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat) > +{ > + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; > + int index; > + struct svm_avic_log_ait_entry *avic_log_ait; > + > + if (!vm_data) > + return NULL; > + > + if (is_flat) { /* flat */ > + if (mda > 7) > + return NULL; > + index = mda; > + } else { /* cluster */ > + int apic_id = mda & 0xf; > + int cluster_id = (mda & 0xf0) >> 8; > + > + if (apic_id > 4 || cluster_id >= 0xf) > + return NULL; > + index = (cluster_id << 2) + apic_id; > + } > + avic_log_ait = (struct svm_avic_log_ait_entry *) > + page_address(vm_data->avic_log_ait_page); > + > + return &avic_log_ait[index]; > +} Instead of these functions, create a complete function to handle APIC_ID and APIC_LDR writes. Then use kmap/kunmap instead of page_address. > +static inline void > +avic_set_bk_page_entry(struct vcpu_svm *svm, int reg_off, u32 val) > +{ > + void *avic_bk = page_address(svm->avic_bk_page); > + > + *((u32 *) (avic_bk + reg_off)) = val; > +} Unused function. > +static inline u32 *avic_get_bk_page_entry(struct vcpu_svm *svm, u32 offset) > +{ > + char *tmp = (char *)page_address(svm->avic_bk_page); > + > + return (u32 *)(tmp+offset); > +} I think you should be able to use kvm_apic_get_reg instead. > +static int avic_init_bk_page(struct kvm_vcpu *vcpu) No abbreviations (but again, please try reusing apic.regs). > +static int avic_alloc_bk_page(struct vcpu_svm *svm, int id) > +{ > + int ret = 0, i; > + bool realloc = false; > + struct kvm_vcpu *vcpu; > + struct kvm *kvm = svm->vcpu.kvm; > + struct svm_vm_data *vm_data = kvm->arch.arch_data; > + > + mutex_lock(&kvm->slots_lock); > + > + /* Check if we have already allocated vAPIC backing > + * page for this vCPU. 
If not, we need to realloc > + * a new one and re-assign all other vCPU. > + */ > + if (kvm->arch.apic_access_page_done && > + (id > vm_data->avic_max_vcpu_id)) { > + kvm_for_each_vcpu(i, vcpu, kvm) > + avic_unalloc_bk_page(vcpu); > + > + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, > + 0, 0); > + realloc = true; > + vm_data->avic_max_vcpu_id = 0; > + } > + > + /* > + * We are allocating vAPIC backing page > + * upto the max vCPU ID > + */ > + if (id >= vm_data->avic_max_vcpu_id) { > + ret = __x86_set_memory_region(kvm, > + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, > + APIC_DEFAULT_PHYS_BASE, > + PAGE_SIZE * (id + 1)); Why is this necessary? The APIC access page is a peculiarity of Intel processors (and the special memslot for only needs to map 0xfee00000 to 0xfee00fff; after that there is the MSI area). > + if (ret) > + goto out; > + > + vm_data->avic_max_vcpu_id = id; > + } > + > + /* Reinit vAPIC backing page for exisinting vcpus */ > + if (realloc) > + kvm_for_each_vcpu(i, vcpu, kvm) > + avic_init_bk_page(vcpu); Why is this necessary? 
> + avic_init_bk_page(&svm->vcpu); > + > + kvm->arch.apic_access_page_done = true; > + > +out: > + mutex_unlock(&kvm->slots_lock); > + return ret; > +} > + > +static void avic_vm_uninit(struct kvm *kvm) > +{ > + struct svm_vm_data *vm_data = kvm->arch.arch_data; > + > + if (!vm_data) > + return; > + > + if (vm_data->avic_log_ait_page) > + __free_page(vm_data->avic_log_ait_page); > + if (vm_data->avic_phy_ait_page) > + __free_page(vm_data->avic_phy_ait_page); > + kfree(vm_data); > + kvm->arch.arch_data = NULL; > +} > + > +static void avic_vcpu_uninit(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_svm *svm = to_svm(vcpu); > + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; > + > + if (!avic) > + return; > + > + avic_unalloc_bk_page(vcpu); > + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs; > + svm->in_kernel_lapic_regs = NULL; > + > + if (vm_data && > + (atomic_read(&vm_data->count) == 0 || > + atomic_dec_and_test(&vm_data->count))) > + avic_vm_uninit(vcpu->kvm); Add a new kvm_x86_ops callback .free_vcpus and call it from kvm_free_vcpus; then count is not necessary. We probably should use it for Intel too. The calls to x86_set_memory_region in kvm_arch_destroy_vm are hideous. > +} > + > +static atomic_t avic_tag_gen = ATOMIC_INIT(1); > + > +static inline u32 avic_get_next_tag(void) > +{ > + u32 tag = atomic_read(&avic_tag_gen); > + > + atomic_inc(&avic_tag_gen); > + return tag; > +} > + > +static int avic_vm_init(struct kvm *kvm) > +{ > + int err = -ENOMEM; > + struct svm_vm_data *vm_data; > + struct page *avic_phy_ait_page; > + struct page *avic_log_ait_page; This is probably also best moved to a new callback .init_vm (called from kvm_arch_init_vm). 
> + vm_data = kzalloc(sizeof(struct svm_vm_data), > + GFP_KERNEL); > + if (!vm_data) > + return err; > + > + kvm->arch.arch_data = vm_data; > + atomic_set(&vm_data->count, 0); > + > + /* Allocating physical APIC ID table (4KB) */ > + avic_phy_ait_page = alloc_page(GFP_KERNEL); > + if (!avic_phy_ait_page) > + goto free_avic; > + > + vm_data->avic_phy_ait_page = avic_phy_ait_page; > + clear_page(page_address(avic_phy_ait_page)); You can use get_zeroed_page and free_page, instead of alloc_page/clear_page/__free_page. Thanks, Paolo > + /* Allocating logical APIC ID table (4KB) */ > + avic_log_ait_page = alloc_page(GFP_KERNEL); > + if (!avic_log_ait_page) > + goto free_avic; > + > + vm_data->avic_log_ait_page = avic_log_ait_page; > + clear_page(page_address(avic_log_ait_page)); > + > + vm_data->avic_tag = avic_get_next_tag(); > + > + return 0; > + > +free_avic: > + avic_vm_uninit(kvm); > + return err; > +} > + > +static int avic_vcpu_init(struct kvm *kvm, struct vcpu_svm *svm, int id) > +{ > + int err; > + struct svm_vm_data *vm_data = NULL; > + > + /* Note: svm_vm_data is per VM */ > + if (!kvm->arch.arch_data) { > + err = avic_vm_init(kvm); > + if (err) > + return err; > + } > + > + err = avic_alloc_bk_page(svm, id); > + if (err) { > + avic_vcpu_uninit(&svm->vcpu); > + return err; > + } > + > + vm_data = kvm->arch.arch_data; > + atomic_inc(&vm_data->count); > + > + return 0; > } > > static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > @@ -1131,6 +1510,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); > kvm_register_write(vcpu, VCPU_REGS_RDX, eax); > + > + if (avic && !init_event) > + avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); > } > > static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > @@ -1169,6 +1551,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > if (!hsave_page) > goto free_page3; > > + if (avic) { > + err = 
avic_vcpu_init(kvm, svm, id); > + if (err) > + goto free_page4; > + } > + > svm->nested.hsave = page_address(hsave_page); > > svm->msrpm = page_address(msrpm_pages); > @@ -1187,6 +1575,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > > return &svm->vcpu; > > +free_page4: > + __free_page(hsave_page); > free_page3: > __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); > free_page2: > @@ -1209,6 +1599,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) > __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); > __free_page(virt_to_page(svm->nested.hsave)); > __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); > + avic_vcpu_uninit(vcpu); > kvm_vcpu_uninit(vcpu); > kmem_cache_free(kvm_vcpu_cache, svm); > } > @@ -3382,6 +3773,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) > pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); > pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); > pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); > + pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); > pr_err("%-20s%08x\n", "event_inj:", control->event_inj); > pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); > pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); > @@ -3613,11 +4005,31 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) > > static bool svm_get_enable_apicv(void) > { > - return false; > + return avic; > +} > + > +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) > +{ > } > > +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) > +{ > +} > + > +/* Note: Currently only used by Hyper-V. 
*/ > static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) > { > + struct vcpu_svm *svm = to_svm(vcpu); > + struct vmcb *vmcb = svm->vmcb; > + > + if (!avic) > + return; > + > + if (!svm->in_kernel_lapic_regs) > + return; > + > + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs; > + vmcb->control.avic_enable = 0; > } > > static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) > @@ -4387,6 +4799,8 @@ static struct kvm_x86_ops svm_x86_ops = { > .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, > .load_eoi_exitmap = svm_load_eoi_exitmap, > .sync_pir_to_irr = svm_sync_pir_to_irr, > + .hwapic_irr_update = svm_hwapic_irr_update, > + .hwapic_isr_update = svm_hwapic_isr_update, > > .set_tss_addr = svm_set_tss_addr, > .get_tdp_level = get_npt_level, > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi On 03/07/2016 11:41 PM, Paolo Bonzini wrote: > On 04/03/2016 21:46, Suravee Suthikulpanit wrote: > > [....] >> +/* Note: This structure is per VM */ >> +struct svm_vm_data { >> + atomic_t count; >> + u32 ldr_mode; >> + u32 avic_max_vcpu_id; >> + u32 avic_tag; >> + >> + struct page *avic_log_ait_page; >> + struct page *avic_phy_ait_page; > > You can put these directly in kvm_arch. Do not use abbreviations: > > struct page *avic_logical_apic_id_table_page; > struct page *avic_physical_apic_id_table_page; > Actually, the reason I would like to introduce this per-arch specific structure is because I feel that it is easier to manage these processor-specific variable/data-structure. If we add all these directly into kvm_arch, which is shared b/w SVM and VMX, it is more difficult to tell which one is used in the different code base. >> [...] >> + memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE); >> + svm->vcpu.arch.apic->regs = vapic_bkpg; > > Can you explain the flipping logic, and why you cannot just use the > existing apic.regs? Please see "explanation 1" below. >> [...] >> +static struct svm_avic_phy_ait_entry * >> +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index) >> +{ >> + [.....] >> +} >> + >> +struct svm_avic_log_ait_entry * >> +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat) >> +{ >> + [.....] >> +} > > Instead of these functions, create a complete function to handle APIC_ID > and APIC_LDR writes. Then use kmap/kunmap instead of page_address. > Ok. May I ask why we are against using page_address? I have see that used in several places in the code. >> [...] >> +static int avic_alloc_bk_page(struct vcpu_svm *svm, int id) >> +{ >> + int ret = 0, i; >> + bool realloc = false; >> + struct kvm_vcpu *vcpu; >> + struct kvm *kvm = svm->vcpu.kvm; >> + struct svm_vm_data *vm_data = kvm->arch.arch_data; >> + >> + mutex_lock(&kvm->slots_lock); >> + >> + /* Check if we have already allocated vAPIC backing >> + * page for this vCPU. 
If not, we need to realloc >> + * a new one and re-assign all other vCPU. >> + */ >> + if (kvm->arch.apic_access_page_done && >> + (id > vm_data->avic_max_vcpu_id)) { >> + kvm_for_each_vcpu(i, vcpu, kvm) >> + avic_unalloc_bk_page(vcpu); >> + >> + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, >> + 0, 0); >> + realloc = true; >> + vm_data->avic_max_vcpu_id = 0; >> + } >> + >> + /* >> + * We are allocating vAPIC backing page >> + * upto the max vCPU ID >> + */ >> + if (id >= vm_data->avic_max_vcpu_id) { >> + ret = __x86_set_memory_region(kvm, >> + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, >> + APIC_DEFAULT_PHYS_BASE, >> + PAGE_SIZE * (id + 1)); > > Why is this necessary? The APIC access page is a peculiarity of Intel > processors (and the special memslot for only needs to map 0xfee00000 to > 0xfee00fff; after that there is the MSI area). > Please see "explanation 1" below. >> [...] >> + if (ret) >> + goto out; >> + >> + vm_data->avic_max_vcpu_id = id; >> + } >> + >> + /* Reinit vAPIC backing page for exisinting vcpus */ >> + if (realloc) >> + kvm_for_each_vcpu(i, vcpu, kvm) >> + avic_init_bk_page(vcpu); > > Why is this necessary? Explanation 1: The current lapic regs page is allocated using get_zeroed_page(), which can be paged out. If I use these pages for AVIC backing pages, it seems to cause VM to slow down quite a bit due to a lot of page faults. Currently, the AVIC backing pages are acquired from __x86_set_memory region() with APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, which maps the pages for address 0xfee00000 and above for VM to use. I mostly grab this from the VMX implementation in alloc_apic_access_page(). However, the memslot requires specification of the size at the time when calling __x86_set_memory_region(). However, I can't seem to figure out where I can get the number of vcpus at the time when we creating VM. Therefore, I have to track the vcpu creation, and re-acquire larger memslot every time vcpu_create() is called. 
I was not sure if this is the right approach, any suggestion for this part. Thanks, Suravee -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 15/03/2016 18:09, Suravee Suthikulpanit wrote: > Hi > > On 03/07/2016 11:41 PM, Paolo Bonzini wrote: >> On 04/03/2016 21:46, Suravee Suthikulpanit wrote: >> > [....] >>> +/* Note: This structure is per VM */ >>> +struct svm_vm_data { >>> + atomic_t count; >>> + u32 ldr_mode; >>> + u32 avic_max_vcpu_id; >>> + u32 avic_tag; >>> + >>> + struct page *avic_log_ait_page; >>> + struct page *avic_phy_ait_page; >> >> You can put these directly in kvm_arch. Do not use abbreviations: >> >> struct page *avic_logical_apic_id_table_page; >> struct page *avic_physical_apic_id_table_page; >> > > Actually, the reason I would like to introduce this per-arch specific > structure is because I feel that it is easier to manage these > processor-specific variable/data-structure. If we add all these directly > into kvm_arch, which is shared b/w SVM and VMX, it is more difficult to > tell which one is used in the different code base. You're right, but adding a pointer makes things slower and larger. Using an anonymous union would work. For now, I prefer to have the fields directly in kvm_arch. >>> [...] >>> +static struct svm_avic_phy_ait_entry * >>> +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index) >>> +{ >>> + [.....] >>> +} >>> + >>> +struct svm_avic_log_ait_entry * >>> +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat) >>> +{ >>> + [.....] >>> +} >> >> Instead of these functions, create a complete function to handle APIC_ID >> and APIC_LDR writes. Then use kmap/kunmap instead of page_address. >> > > Ok. May I ask why we are against using page_address? I have see that > used in several places in the code. You're right, I guess page_address is okay for pages that were allocated with alloc_page(). >> Why is this necessary? The APIC access page is a peculiarity of Intel >> processors (and the special memslot for only needs to map 0xfee00000 to >> 0xfee00fff; after that there is the MSI area). 
> > The current lapic regs page is allocated using get_zeroed_page(), which > can be paged out. If I use these pages for AVIC backing pages, it seems > to cause VM to slow down quite a bit due to a lot of page faults. What causes the lapic regs page to be paged out? > Currently, the AVIC backing pages are acquired from __x86_set_memory > region() with APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, which maps the pages for > address 0xfee00000 and above for VM to use. I mostly grab this from the > VMX implementation in alloc_apic_access_page(). > > However, the memslot requires specification of the size at the time when > calling __x86_set_memory_region(). However, I can't seem to figure out > where I can get the number of vcpus at the time when we creating VM. > Therefore, I have to track the vcpu creation, and re-acquire larger > memslot every time vcpu_create() is called. The purpose of the APIC_ACCESS_PAGE_PRIVATE_MEMSLOT is very specific: it is there to provide a mapping for 0xfee00000 because Intel processors trap writes between 0xfee00000 and 0xfee00fff, but otherwise ignore the contents of the page you map there. Intel processors only need something to compare the physical address with, they don't care about the data in the page, so they have one page per VM (they could really use a single page in the whole system---one per VM is just how KVM works right now). It is a peculiar design, and one that you should probably ignore in your AVIC patches. The AVIC backing page is more similar to Intel's "virtual-APIC page". You can see that vmx.c just uses lapic->regs for it. if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); if (cpu_need_tpr_shadow(vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi, On 03/16/2016 12:22 AM, Paolo Bonzini wrote: >>> Why is this necessary? The APIC access page is a peculiarity of Intel >>> >>processors (and the special memslot for only needs to map 0xfee00000 to >>> >>0xfee00fff; after that there is the MSI area). >> > >> >The current lapic regs page is allocated using get_zeroed_page(), which >> >can be paged out. If I use these pages for AVIC backing pages, it seems >> >to cause VM to slow down quite a bit due to a lot of page faults. > What causes the lapic regs page to be paged out? > This is mainly causing a large number of VMEXIT due to NPF. In my test running hackbench in the guest. The following are perf result profiling for 10 seconds in the host in two cases: CASE1: Using x86_set_memory_region() for AVIC backing page # ./perf-vmexit.sh 10 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 2.813 MB perf.data.guest (30356 samples) ] Analyze events for all VMs, all VCPUs: VM-EXIT Samples Samples% Time% Min Time Max Time Avg time interrupt 10042 66.30% 81.33% 0.43us 202.50us 7.43us ( +- 1.20% ) msr 5004 33.04% 15.76% 0.73us 12.21us 2.89us ( +- 0.43% ) pause 58 0.38% 0.18% 0.56us 5.88us 2.92us ( +- 6.43% ) npf 35 0.23% 2.01% 6.41us 207.78us 52.70us ( +- 23.67% ) nmi 4 0.03% 0.02% 2.31us 4.67us 3.49us ( +- 14.26% ) io 3 0.02% 0.70% 82.75us 360.90us 214.28us ( +- 37.64% ) avic_incomp_ipi 1 0.01% 0.00% 2.17us 2.17us 2.17us ( +- 0.00% ) Total Samples:15147, Total events handled time:91715.78us. CASE2: Using the lapic regs page for AVIC backing page. 
# ./perf-vmexit.sh 10 [ perf record: Woken up 255 times to write data ] [ perf record: Captured and wrote 509.202 MB perf.data.guest (5718856 samples) ] Analyze events for all VMs, all VCPUs: VM-EXIT Samples Samples% Time% Min Time Max Time Avg time npf 1897710 99.33% 98.08% 1.09us 243.22us 1.67us ( +- 0.04% ) interrupt 7818 0.41% 1.44% 0.44us 216.55us 5.97us ( +- 1.92% ) msr 5001 0.26% 0.45% 0.68us 12.58us 2.89us ( +- 0.50% ) pause 25 0.00% 0.00% 0.71us 4.23us 2.03us ( +- 10.76% ) io 4 0.00% 0.03% 73.91us 337.29us 206.74us ( +- 26.38% ) nmi 1 0.00% 0.00% 5.92us 5.92us 5.92us ( +- 0.00% ) Total Samples:1910559, Total events handled time:3229214.64us. Thanks, Suravee -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 16/03/2016 07:22, Suravee Suthikulpanit wrote: > This is mainly causing a large number of VMEXIT due to NPF. Got it, it's here in the manual: "System software is responsible for setting up a translation in the nested page table granting guest read and write permissions for accesses to the vAPIC Backing Page in SPA space. AVIC hardware walks the nested page table to check permissions, but does not use the SPA address specified in the leaf page table entry. Instead, AVIC hardware finds this address in the AVIC_BACKING_PAGE pointer field of the VMCB". Strictly speaking the address of the 0xFEE00000 translation is unnecessary and it could be all zeroes, but I suggest that you set up an APIC access page like Intel does (4k only), using the special memslot. The AVIC backing page can then point to lapic->regs. Thanks for the explanation! Paolo > CASE1: Using x86_set_memory_region() for AVIC backing page > > # ./perf-vmexit.sh 10 > [ perf record: Woken up 1 times to write data ] > [ perf record: Captured and wrote 2.813 MB perf.data.guest (30356 > samples) ] > > > Analyze events for all VMs, all VCPUs: > > VM-EXIT Samples Samples% Time% Min Time Max > Time Avg time > > interrupt 10042 66.30% 81.33% 0.43us > 202.50us 7.43us ( +- 1.20% ) > msr 5004 33.04% 15.76% 0.73us > 12.21us 2.89us ( +- 0.43% ) > pause 58 0.38% 0.18% 0.56us > 5.88us 2.92us ( +- 6.43% ) > npf 35 0.23% 2.01% 6.41us > 207.78us 52.70us ( +- 23.67% ) > nmi 4 0.03% 0.02% 2.31us > 4.67us 3.49us ( +- 14.26% ) > io 3 0.02% 0.70% 82.75us > 360.90us 214.28us ( +- 37.64% ) > avic_incomp_ipi 1 0.01% 0.00% 2.17us > 2.17us 2.17us ( +- 0.00% ) > > Total Samples:15147, Total events handled time:91715.78us. > > > CASE2: Using the lapic regs page for AVIC backing page. 
> > # ./perf-vmexit.sh 10 > [ perf record: Woken up 255 times to write data ] > [ perf record: Captured and wrote 509.202 MB perf.data.guest (5718856 > samples) ] > > > Analyze events for all VMs, all VCPUs: > > VM-EXIT Samples Samples% Time% Min Time Max > Time Avg time > > npf 1897710 99.33% 98.08% 1.09us > 243.22us 1.67us ( +- 0.04% ) > interrupt 7818 0.41% 1.44% 0.44us > 216.55us 5.97us ( +- 1.92% ) > msr 5001 0.26% 0.45% 0.68us > 12.58us 2.89us ( +- 0.50% ) > pause 25 0.00% 0.00% 0.71us > 4.23us 2.03us ( +- 10.76% ) > io 4 0.00% 0.03% 73.91us > 337.29us 206.74us ( +- 26.38% ) > nmi 1 0.00% 0.00% 5.92us > 5.92us 5.92us ( +- 0.00% ) > > Total Samples:1910559, Total events handled time:3229214.64us. > > Thanks, > Suravee > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi, On 03/16/2016 02:20 PM, Paolo Bonzini wrote: > > On 16/03/2016 07:22, Suravee Suthikulpanit wrote: >> >This is mainly causing a large number of VMEXIT due to NPF. > Got it, it's here in the manual: "System software is responsible for > setting up a translation in the nested page table granting guest read > and write permissions for accesses to the vAPIC Backing Page in SPA > space. AVIC hardware walks the nested page table to check permissions, > but does not use the SPA address specified in the leaf page table entry. > Instead, AVIC hardware finds this address in the AVIC_BACKING_PAGE > pointer field of the VMCB". > > Strictly speaking the address of the 0xFEE00000 translation is > unnecessary and it could be all zeroes, but I suggest that you set up an > APIC access page like Intel does (4k only), using the special memslot. > The AVIC backing page can then point to lapic->regs. > > Thanks for the explanation! > > Paolo > Ahh... you are right, this works also. Thanks for the pointer. I'm fixing this, doing some more testing, and cleaning up the code. This has simplify the init logic quite a bit. Thanks for suggestion, Suravee -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 16/03/2016 09:21, Suravee Suthikulpanit wrote: >> Strictly speaking the address of the 0xFEE00000 translation is >> unnecessary and it could be all zeroes, but I suggest that you set up an >> APIC access page like Intel does (4k only), using the special memslot. >> The AVIC backing page can then point to lapic->regs. > > Ahh... you are right, this works also. Thanks for the pointer. I'm > fixing this, doing some more testing, and cleaning up the code. This has > simplify the init logic quite a bit. Awesome, thanks. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9a61669..15f6cd8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -754,6 +754,8 @@ struct kvm_arch { bool irqchip_split; u8 nr_reserved_ioapic_pins; + + void *arch_data; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6ab66d0..c703149 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -14,6 +14,9 @@ * the COPYING file in the top-level directory. * */ + +#define pr_fmt(fmt) "SVM: " fmt + #include <linux/kvm_host.h> #include "irq.h" @@ -78,6 +81,11 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define TSC_RATIO_MIN 0x0000000000000001ULL #define TSC_RATIO_MAX 0x000000ffffffffffULL +#define AVIC_HPA_MASK ~((0xFFFULL << 52) || 0xFFF) + +/* NOTE: Current max index allowed for physical APIC ID table is 255 */ +#define AVIC_PHY_APIC_ID_MAX 0xFF + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -162,6 +170,37 @@ struct vcpu_svm { /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; + + struct page *avic_bk_page; + void *in_kernel_lapic_regs; +}; + +struct __attribute__ ((__packed__)) +svm_avic_log_ait_entry { + u32 guest_phy_apic_id : 8, + res : 23, + valid : 1; +}; + +struct __attribute__ ((__packed__)) +svm_avic_phy_ait_entry { + u64 host_phy_apic_id : 8, + res1 : 4, + bk_pg_ptr : 40, + res2 : 10, + is_running : 1, + valid : 1; +}; + +/* Note: This structure is per VM */ +struct svm_vm_data { + atomic_t count; + u32 ldr_mode; + u32 avic_max_vcpu_id; + u32 avic_tag; + + struct page *avic_log_ait_page; + struct page *avic_phy_ait_page; }; static DEFINE_PER_CPU(u64, current_tsc_ratio); @@ -205,6 +244,10 @@ module_param(npt, int, S_IRUGO); static int nested = true; module_param(nested, int, S_IRUGO); +/* enable / disable AVIC */ +static int avic; +module_param(avic, int, S_IRUGO); + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 
static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -234,6 +277,13 @@ enum { /* TPR and CR2 are always written before VMRUN */ #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + +static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) +{ + svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; +} + static inline void mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -923,6 +973,13 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); + if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC))) + avic = false; + + if (avic) { + printk(KERN_INFO "kvm: AVIC enabled\n"); + } + return 0; err: @@ -1000,6 +1057,41 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static void avic_init_vmcb(struct vcpu_svm *svm) +{ + void *vapic_bkpg; + struct vmcb *vmcb = svm->vmcb; + struct svm_vm_data *vm_data = svm->vcpu.kvm->arch.arch_data; + phys_addr_t bpa = PFN_PHYS(page_to_pfn(svm->avic_bk_page)); + phys_addr_t lpa = PFN_PHYS(page_to_pfn(vm_data->avic_log_ait_page)); + phys_addr_t ppa = PFN_PHYS(page_to_pfn(vm_data->avic_phy_ait_page)); + + if (!vmcb) + return; + + pr_debug("%s: bpa=%#llx, lpa=%#llx, ppa=%#llx\n", + __func__, (unsigned long long)bpa, + (unsigned long long)lpa, (unsigned long long)ppa); + + + /* + * Here we copy the value from the emulated LAPIC register + * to the vAPIC backign page, and then flip regs frame. 
+ */ + if (!svm->in_kernel_lapic_regs) + svm->in_kernel_lapic_regs = svm->vcpu.arch.apic->regs; + + vapic_bkpg = pfn_to_kaddr(page_to_pfn(svm->avic_bk_page)); + memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE); + svm->vcpu.arch.apic->regs = vapic_bkpg; + + vmcb->control.avic_bk_page = bpa & AVIC_HPA_MASK; + vmcb->control.avic_log_apic_id = lpa & AVIC_HPA_MASK; + vmcb->control.avic_phy_apic_id = ppa & AVIC_HPA_MASK; + vmcb->control.avic_phy_apic_id |= AVIC_PHY_APIC_ID_MAX; + vmcb->control.avic_enable = 1; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1113,6 +1205,293 @@ static void init_vmcb(struct vcpu_svm *svm) mark_all_dirty(svm->vmcb); enable_gif(svm); + + if (avic) + avic_init_vmcb(svm); +} + +static struct svm_avic_phy_ait_entry * +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index) +{ + struct svm_avic_phy_ait_entry *avic_phy_ait; + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; + + if (!vm_data) + return NULL; + + /* Note: APIC ID = 0xff is used for broadcast. + * APIC ID > 0xff is reserved. 
+ */ + if (index >= 0xff) + return NULL; + + avic_phy_ait = page_address(vm_data->avic_phy_ait_page); + + return &avic_phy_ait[index]; +} + +struct svm_avic_log_ait_entry * +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat) +{ + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; + int index; + struct svm_avic_log_ait_entry *avic_log_ait; + + if (!vm_data) + return NULL; + + if (is_flat) { /* flat */ + if (mda > 7) + return NULL; + index = mda; + } else { /* cluster */ + int apic_id = mda & 0xf; + int cluster_id = (mda & 0xf0) >> 4; + + if (apic_id > 4 || cluster_id >= 0xf) + return NULL; + index = (cluster_id << 2) + apic_id; + } + avic_log_ait = (struct svm_avic_log_ait_entry *) + page_address(vm_data->avic_log_ait_page); + + return &avic_log_ait[index]; +} + +static inline void +avic_set_bk_page_entry(struct vcpu_svm *svm, int reg_off, u32 val) +{ + void *avic_bk = page_address(svm->avic_bk_page); + + *((u32 *) (avic_bk + reg_off)) = val; +} + +static inline u32 *avic_get_bk_page_entry(struct vcpu_svm *svm, u32 offset) +{ + char *tmp = (char *)page_address(svm->avic_bk_page); + + return (u32 *)(tmp+offset); +} + +static int avic_init_log_apic_entry(struct kvm_vcpu *vcpu, u8 g_phy_apic_id, + u8 log_apic_id) +{ + u32 mod; + struct svm_avic_log_ait_entry *entry; + struct vcpu_svm *svm = to_svm(vcpu); + + if (!svm) + return -EINVAL; + + mod = (*avic_get_bk_page_entry(svm, APIC_DFR) >> 28) & 0xf; + entry = avic_get_log_ait_entry(vcpu, log_apic_id, (mod == 0xf)); + if (!entry) + return -EINVAL; + entry->guest_phy_apic_id = g_phy_apic_id; + entry->valid = 1; + + return 0; +} + +static int avic_init_bk_page(struct kvm_vcpu *vcpu) +{ + u64 addr; + struct page *page; + int id = vcpu->vcpu_id; + struct kvm *kvm = vcpu->kvm; + struct vcpu_svm *svm = to_svm(vcpu); + + addr = APIC_DEFAULT_PHYS_BASE + (id * PAGE_SIZE); + page = gfn_to_page(kvm, addr >> PAGE_SHIFT); + if (is_error_page(page)) + return -EFAULT; + + /* + * Do not pin the page in memory, so 
that memory hot-unplug + * is able to migrate it. + */ + put_page(page); + + /* Setting up AVIC Backing Page */ + svm->avic_bk_page = page; + + clear_page(kmap(page)); + pr_debug("%s: vAPIC bk page: cpu=%u, addr=%#llx, pa=%#llx\n", + __func__, id, addr, + (unsigned long long) PFN_PHYS(page_to_pfn(page))); + + avic_init_vmcb(svm); + + return 0; +} + +static inline void avic_unalloc_bk_page(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (svm->avic_bk_page) + kunmap(svm->avic_bk_page); +} + +static int avic_alloc_bk_page(struct vcpu_svm *svm, int id) +{ + int ret = 0, i; + bool realloc = false; + struct kvm_vcpu *vcpu; + struct kvm *kvm = svm->vcpu.kvm; + struct svm_vm_data *vm_data = kvm->arch.arch_data; + + mutex_lock(&kvm->slots_lock); + + /* Check if we have already allocated vAPIC backing + * page for this vCPU. If not, we need to realloc + * a new one and re-assign all other vCPU. + */ + if (kvm->arch.apic_access_page_done && + (id > vm_data->avic_max_vcpu_id)) { + kvm_for_each_vcpu(i, vcpu, kvm) + avic_unalloc_bk_page(vcpu); + + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + 0, 0); + realloc = true; + vm_data->avic_max_vcpu_id = 0; + } + + /* + * We are allocating vAPIC backing page + * upto the max vCPU ID + */ + if (id >= vm_data->avic_max_vcpu_id) { + ret = __x86_set_memory_region(kvm, + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, + PAGE_SIZE * (id + 1)); + if (ret) + goto out; + + vm_data->avic_max_vcpu_id = id; + } + + /* Reinit vAPIC backing page for exisinting vcpus */ + if (realloc) + kvm_for_each_vcpu(i, vcpu, kvm) + avic_init_bk_page(vcpu); + + avic_init_bk_page(&svm->vcpu); + + kvm->arch.apic_access_page_done = true; + +out: + mutex_unlock(&kvm->slots_lock); + return ret; +} + +static void avic_vm_uninit(struct kvm *kvm) +{ + struct svm_vm_data *vm_data = kvm->arch.arch_data; + + if (!vm_data) + return; + + if (vm_data->avic_log_ait_page) + __free_page(vm_data->avic_log_ait_page); + if 
(vm_data->avic_phy_ait_page) + __free_page(vm_data->avic_phy_ait_page); + kfree(vm_data); + kvm->arch.arch_data = NULL; +} + +static void avic_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; + + if (!avic) + return; + + avic_unalloc_bk_page(vcpu); + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs; + svm->in_kernel_lapic_regs = NULL; + + if (vm_data && + (atomic_read(&vm_data->count) == 0 || + atomic_dec_and_test(&vm_data->count))) + avic_vm_uninit(vcpu->kvm); +} + +static atomic_t avic_tag_gen = ATOMIC_INIT(1); + +static inline u32 avic_get_next_tag(void) +{ + u32 tag = atomic_read(&avic_tag_gen); + + atomic_inc(&avic_tag_gen); + return tag; +} + +static int avic_vm_init(struct kvm *kvm) +{ + int err = -ENOMEM; + struct svm_vm_data *vm_data; + struct page *avic_phy_ait_page; + struct page *avic_log_ait_page; + + vm_data = kzalloc(sizeof(struct svm_vm_data), + GFP_KERNEL); + if (!vm_data) + return err; + + kvm->arch.arch_data = vm_data; + atomic_set(&vm_data->count, 0); + + /* Allocating physical APIC ID table (4KB) */ + avic_phy_ait_page = alloc_page(GFP_KERNEL); + if (!avic_phy_ait_page) + goto free_avic; + + vm_data->avic_phy_ait_page = avic_phy_ait_page; + clear_page(page_address(avic_phy_ait_page)); + + /* Allocating logical APIC ID table (4KB) */ + avic_log_ait_page = alloc_page(GFP_KERNEL); + if (!avic_log_ait_page) + goto free_avic; + + vm_data->avic_log_ait_page = avic_log_ait_page; + clear_page(page_address(avic_log_ait_page)); + + vm_data->avic_tag = avic_get_next_tag(); + + return 0; + +free_avic: + avic_vm_uninit(kvm); + return err; +} + +static int avic_vcpu_init(struct kvm *kvm, struct vcpu_svm *svm, int id) +{ + int err; + struct svm_vm_data *vm_data = NULL; + + /* Note: svm_vm_data is per VM */ + if (!kvm->arch.arch_data) { + err = avic_vm_init(kvm); + if (err) + return err; + } + + err = avic_alloc_bk_page(svm, id); + if (err) { + 
avic_vcpu_uninit(&svm->vcpu); + return err; + } + + vm_data = kvm->arch.arch_data; + atomic_inc(&vm_data->count); + + return 0; } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1131,6 +1510,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); + + if (avic && !init_event) + avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) @@ -1169,6 +1551,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!hsave_page) goto free_page3; + if (avic) { + err = avic_vcpu_init(kvm, svm, id); + if (err) + goto free_page4; + } + svm->nested.hsave = page_address(hsave_page); svm->msrpm = page_address(msrpm_pages); @@ -1187,6 +1575,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; +free_page4: + __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -1209,6 +1599,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + avic_vcpu_uninit(vcpu); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -3382,6 +3773,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); + pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); @@ -3613,11 +4005,31 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu 
*vcpu, bool set) static bool svm_get_enable_apicv(void) { - return false; + return avic; +} + +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ } +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) +{ +} + +/* Note: Currently only used by Hyper-V. */ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + + if (!avic) + return; + + if (!svm->in_kernel_lapic_regs) + return; + + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs; + vmcb->control.avic_enable = 0; } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -4387,6 +4799,8 @@ static struct kvm_x86_ops svm_x86_ops = { .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, + .hwapic_irr_update = svm_hwapic_irr_update, + .hwapic_isr_update = svm_hwapic_isr_update, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level,
This patch introduces AVIC-related data structure, and AVIC initialization code. There are three main data structures for AVIC: * Virtual APIC (vAPIC) backing page (per-VCPU) * Physical APIC ID table (per-VM) * Logical APIC ID table (per-VM) In order to accommodate the new per-VM tables, we introduce a new per-VM arch-specific void pointer, struct kvm_arch.arch_data. This will point to the newly introduced struct svm_vm_data. Currently, AVIC is disabled by default. Users can manually enable AVIC via kernel boot option kvm-amd.avic=1 or during kvm-amd module loading with parameter avic=1. Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/svm.c | 416 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 417 insertions(+), 1 deletion(-)