diff mbox

[PART1,RFC,v2,05/10] KVM: x86: Detect and Initialize AVIC support

Message ID 1457124368-2025-6-git-send-email-Suravee.Suthikulpanit@amd.com (mailing list archive)
State New, archived
Headers show

Commit Message

Suthikulpanit, Suravee March 4, 2016, 8:46 p.m. UTC
This patch introduces AVIC-related data structure, and AVIC
initialization code.

There are three main data structures for AVIC:
    * Virtual APIC (vAPIC) backing page (per-VCPU)
    * Physical APIC ID table (per-VM)
    * Logical APIC ID table (per-VM)

In order to accommodate the new per-VM tables, we introduce
a new per-VM arch-specific void pointer, struct kvm_arch.arch_data.
This will point to the newly introduced struct svm_vm_data.

Currently, AVIC is disabled by default. Users can manually
enable AVIC via kernel boot option kvm-amd.avic=1 or during
kvm-amd module loading with parameter avic=1.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/svm.c              | 416 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 417 insertions(+), 1 deletion(-)

Comments

Paolo Bonzini March 7, 2016, 4:41 p.m. UTC | #1
On 04/03/2016 21:46, Suravee Suthikulpanit wrote:

> @@ -162,6 +170,37 @@ struct vcpu_svm {
>  
>  	/* cached guest cpuid flags for faster access */
>  	bool nrips_enabled	: 1;
> +
> +	struct page *avic_bk_page;
> +	void *in_kernel_lapic_regs;
> +};
> +
> +struct __attribute__ ((__packed__))
> +svm_avic_log_ait_entry {
> +	u32 guest_phy_apic_id	: 8,
> +	    res			: 23,
> +	    valid		: 1;
> +};
> +
> +struct __attribute__ ((__packed__))
> +svm_avic_phy_ait_entry {
> +	u64 host_phy_apic_id	: 8,
> +	    res1		: 4,
> +	    bk_pg_ptr		: 40,
> +	    res2		: 10,
> +	    is_running		: 1,
> +	    valid		: 1;
> +};

Please don't use bitfields.

> +/* Note: This structure is per VM */
> +struct svm_vm_data {
> +	atomic_t count;
> +	u32 ldr_mode;
> +	u32 avic_max_vcpu_id;
> +	u32 avic_tag;
> +
> +	struct page *avic_log_ait_page;
> +	struct page *avic_phy_ait_page;

You can put these directly in kvm_arch.  Do not use abbreviations:

	struct page *avic_logical_apic_id_table_page;
	struct page *avic_physical_apic_id_table_page;

> @@ -1000,6 +1057,41 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
>  	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
>  }
>  
> +static void avic_init_vmcb(struct vcpu_svm *svm)
> +{
> +	void *vapic_bkpg;
> +	struct vmcb *vmcb = svm->vmcb;
> +	struct svm_vm_data *vm_data = svm->vcpu.kvm->arch.arch_data;
> +	phys_addr_t bpa = PFN_PHYS(page_to_pfn(svm->avic_bk_page));
> +	phys_addr_t lpa = PFN_PHYS(page_to_pfn(vm_data->avic_log_ait_page));
> +	phys_addr_t ppa = PFN_PHYS(page_to_pfn(vm_data->avic_phy_ait_page));

Please use page_to_phys.

> +	if (!vmcb)
> +		return;
> +
> +	pr_debug("%s: bpa=%#llx, lpa=%#llx, ppa=%#llx\n",
> +		 __func__, (unsigned long long)bpa,
> +		 (unsigned long long)lpa, (unsigned long long)ppa);

No pr_debug.

> +
> +	/*
> +	 * Here we copy the value from the emulated LAPIC register
> +	 * to the vAPIC backign page, and then flip regs frame.
> +	 */
> +	if (!svm->in_kernel_lapic_regs)
> +		svm->in_kernel_lapic_regs = svm->vcpu.arch.apic->regs;
> +
> +	vapic_bkpg = pfn_to_kaddr(page_to_pfn(svm->avic_bk_page));

Please use kmap instead.

> +	memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE);
> +	svm->vcpu.arch.apic->regs = vapic_bkpg;

Can you explain the flipping logic, and why you cannot just use the
existing apic.regs?

> +	vmcb->control.avic_bk_page = bpa & AVIC_HPA_MASK;

No abbreviations ("avic_backing_page").

> +	vmcb->control.avic_log_apic_id = lpa & AVIC_HPA_MASK;
> +	vmcb->control.avic_phy_apic_id = ppa & AVIC_HPA_MASK;
> +	vmcb->control.avic_phy_apic_id |= AVIC_PHY_APIC_ID_MAX;
> +	vmcb->control.avic_enable = 1;
> +}
> +
>  static void init_vmcb(struct vcpu_svm *svm)
>  {
>  	struct vmcb_control_area *control = &svm->vmcb->control;
> @@ -1113,6 +1205,293 @@ static void init_vmcb(struct vcpu_svm *svm)
>  	mark_all_dirty(svm->vmcb);
>  
>  	enable_gif(svm);
> +
> +	if (avic)
> +		avic_init_vmcb(svm);
> +}
> +
> +static struct svm_avic_phy_ait_entry *
> +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index)
> +{
> +	struct svm_avic_phy_ait_entry *avic_phy_ait;
> +	struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
> +
> +	if (!vm_data)
> +		return NULL;
> +
> +	/* Note: APIC ID = 0xff is used for broadcast.
> +	 *       APIC ID > 0xff is reserved.
> +	 */
> +	if (index >= 0xff)
> +		return NULL;
> +
> +	avic_phy_ait = page_address(vm_data->avic_phy_ait_page);
> +
> +	return &avic_phy_ait[index];
> +}
> +
> +struct svm_avic_log_ait_entry *
> +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat)
> +{
> +	struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
> +	int index;
> +	struct svm_avic_log_ait_entry *avic_log_ait;
> +
> +	if (!vm_data)
> +		return NULL;
> +
> +	if (is_flat) { /* flat */
> +		if (mda > 7)
> +			return NULL;
> +		index = mda;
> +	} else { /* cluster */
> +		int apic_id = mda & 0xf;
> +		int cluster_id = (mda & 0xf0) >> 8;
> +
> +		if (apic_id > 4 || cluster_id >= 0xf)
> +			return NULL;
> +		index = (cluster_id << 2) + apic_id;
> +	}
> +	avic_log_ait = (struct svm_avic_log_ait_entry *)
> +				page_address(vm_data->avic_log_ait_page);
> +
> +	return &avic_log_ait[index];
> +}

Instead of these functions, create a complete function to handle APIC_ID
and APIC_LDR writes.  Then use kmap/kunmap instead of page_address.

> +static inline void
> +avic_set_bk_page_entry(struct vcpu_svm *svm, int reg_off, u32 val)
> +{
> +	void *avic_bk = page_address(svm->avic_bk_page);
> +
> +	*((u32 *) (avic_bk + reg_off)) = val;
> +}

Unused function.

> +static inline u32 *avic_get_bk_page_entry(struct vcpu_svm *svm, u32 offset)
> +{
> +	char *tmp = (char *)page_address(svm->avic_bk_page);
> +
> +	return (u32 *)(tmp+offset);
> +}

I think you should be able to use kvm_apic_get_reg instead.

> +static int avic_init_bk_page(struct kvm_vcpu *vcpu)

No abbreviations (but again, please try reusing apic.regs).

> +static int avic_alloc_bk_page(struct vcpu_svm *svm, int id)
> +{
> +	int ret = 0, i;
> +	bool realloc = false;
> +	struct kvm_vcpu *vcpu;
> +	struct kvm *kvm = svm->vcpu.kvm;
> +	struct svm_vm_data *vm_data = kvm->arch.arch_data;
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	/* Check if we have already allocated vAPIC backing
> +	 * page for this vCPU. If not, we need to realloc
> +	 * a new one and re-assign all other vCPU.
> +	 */
> +	if (kvm->arch.apic_access_page_done &&
> +	    (id > vm_data->avic_max_vcpu_id)) {
> +		kvm_for_each_vcpu(i, vcpu, kvm)
> +			avic_unalloc_bk_page(vcpu);
> +
> +		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> +					0, 0);
> +		realloc = true;
> +		vm_data->avic_max_vcpu_id = 0;
> +	}
> +
> +	/*
> +	 * We are allocating vAPIC backing page
> +	 * upto the max vCPU ID
> +	 */
> +	if (id >= vm_data->avic_max_vcpu_id) {
> +		ret = __x86_set_memory_region(kvm,
> +					      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> +					      APIC_DEFAULT_PHYS_BASE,
> +					      PAGE_SIZE * (id + 1));

Why is this necessary?  The APIC access page is a peculiarity of Intel
processors (and the special memslot for only needs to map 0xfee00000 to
0xfee00fff; after that there is the MSI area).

> +		if (ret)
> +			goto out;
> +
> +		vm_data->avic_max_vcpu_id = id;
> +	}
> +
> +	/* Reinit vAPIC backing page for exisinting vcpus */
> +	if (realloc)
> +		kvm_for_each_vcpu(i, vcpu, kvm)
> +			avic_init_bk_page(vcpu);

Why is this necessary?

> +	avic_init_bk_page(&svm->vcpu);
> +
> +	kvm->arch.apic_access_page_done = true;
> +
> +out:
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +static void avic_vm_uninit(struct kvm *kvm)
> +{
> +	struct svm_vm_data *vm_data = kvm->arch.arch_data;
> +
> +	if (!vm_data)
> +		return;
> +
> +	if (vm_data->avic_log_ait_page)
> +		__free_page(vm_data->avic_log_ait_page);
> +	if (vm_data->avic_phy_ait_page)
> +		__free_page(vm_data->avic_phy_ait_page);
> +	kfree(vm_data);
> +	kvm->arch.arch_data = NULL;
> +}
> +
> +static void avic_vcpu_uninit(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +	struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
> +
> +	if (!avic)
> +		return;
> +
> +	avic_unalloc_bk_page(vcpu);
> +	svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs;
> +	svm->in_kernel_lapic_regs = NULL;
> +
> +	if (vm_data &&
> +	    (atomic_read(&vm_data->count) == 0 ||
> +	     atomic_dec_and_test(&vm_data->count)))
> +		avic_vm_uninit(vcpu->kvm);

Add a new kvm_x86_ops callback .free_vcpus and call it from
kvm_free_vcpus; then count is not necessary.

We probably should use it for Intel too.  The calls to
x86_set_memory_region in kvm_arch_destroy_vm are hideous.

> +}
> +
> +static atomic_t avic_tag_gen = ATOMIC_INIT(1);
> +
> +static inline u32 avic_get_next_tag(void)
> +{
> +	u32 tag = atomic_read(&avic_tag_gen);
> +
> +	atomic_inc(&avic_tag_gen);
> +	return tag;
> +}
> +
> +static int avic_vm_init(struct kvm *kvm)
> +{
> +	int err = -ENOMEM;
> +	struct svm_vm_data *vm_data;
> +	struct page *avic_phy_ait_page;
> +	struct page *avic_log_ait_page;

This is probably also best moved to a new callback .init_vm (called from
kvm_arch_init_vm).

> +	vm_data = kzalloc(sizeof(struct svm_vm_data),
> +				      GFP_KERNEL);
> +	if (!vm_data)
> +		return err;
> +
> +	kvm->arch.arch_data = vm_data;
> +	atomic_set(&vm_data->count, 0);
> +
> +	/* Allocating physical APIC ID table (4KB) */
> +	avic_phy_ait_page = alloc_page(GFP_KERNEL);
> +	if (!avic_phy_ait_page)
> +		goto free_avic;
> +
> +	vm_data->avic_phy_ait_page = avic_phy_ait_page;
> +	clear_page(page_address(avic_phy_ait_page));

You can use get_zeroed_page and free_page, instead of
alloc_page/clear_page/__free_page.

Thanks,

Paolo

> +	/* Allocating logical APIC ID table (4KB) */
> +	avic_log_ait_page = alloc_page(GFP_KERNEL);
> +	if (!avic_log_ait_page)
> +		goto free_avic;
> +
> +	vm_data->avic_log_ait_page = avic_log_ait_page;
> +	clear_page(page_address(avic_log_ait_page));
> +
> +	vm_data->avic_tag = avic_get_next_tag();
> +
> +	return 0;
> +
> +free_avic:
> +	avic_vm_uninit(kvm);
> +	return err;
> +}
> +
> +static int avic_vcpu_init(struct kvm *kvm, struct vcpu_svm *svm, int id)
> +{
> +	int err;
> +	struct svm_vm_data *vm_data = NULL;
> +
> +	/* Note: svm_vm_data is per VM */
> +	if (!kvm->arch.arch_data) {
> +		err = avic_vm_init(kvm);
> +		if (err)
> +			return err;
> +	}
> +
> +	err = avic_alloc_bk_page(svm, id);
> +	if (err) {
> +		avic_vcpu_uninit(&svm->vcpu);
> +		return err;
> +	}
> +
> +	vm_data = kvm->arch.arch_data;
> +	atomic_inc(&vm_data->count);
> +
> +	return 0;
>  }
>  
>  static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> @@ -1131,6 +1510,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>  
>  	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
>  	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
> +
> +	if (avic && !init_event)
> +		avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
>  }
>  
>  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
> @@ -1169,6 +1551,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
>  	if (!hsave_page)
>  		goto free_page3;
>  
> +	if (avic) {
> +		err = avic_vcpu_init(kvm, svm, id);
> +		if (err)
> +			goto free_page4;
> +	}
> +
>  	svm->nested.hsave = page_address(hsave_page);
>  
>  	svm->msrpm = page_address(msrpm_pages);
> @@ -1187,6 +1575,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
>  
>  	return &svm->vcpu;
>  
> +free_page4:
> +	__free_page(hsave_page);
>  free_page3:
>  	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
>  free_page2:
> @@ -1209,6 +1599,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
>  	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
>  	__free_page(virt_to_page(svm->nested.hsave));
>  	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
> +	avic_vcpu_uninit(vcpu);
>  	kvm_vcpu_uninit(vcpu);
>  	kmem_cache_free(kvm_vcpu_cache, svm);
>  }
> @@ -3382,6 +3773,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
>  	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
>  	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
>  	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
> +	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
>  	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
>  	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
>  	pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
> @@ -3613,11 +4005,31 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
>  
>  static bool svm_get_enable_apicv(void)
>  {
> -	return false;
> +	return avic;
> +}
> +
> +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
> +{
>  }
>  
> +static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
> +{
> +}
> +
> +/* Note: Currently only used by Hyper-V. */
>  static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
>  {
> +	struct vcpu_svm *svm = to_svm(vcpu);
> +	struct vmcb *vmcb = svm->vmcb;
> +
> +	if (!avic)
> +		return;
> +
> +	if (!svm->in_kernel_lapic_regs)
> +		return;
> +
> +	svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs;
> +	vmcb->control.avic_enable = 0;
>  }
>  
>  static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
> @@ -4387,6 +4799,8 @@ static struct kvm_x86_ops svm_x86_ops = {
>  	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
>  	.load_eoi_exitmap = svm_load_eoi_exitmap,
>  	.sync_pir_to_irr = svm_sync_pir_to_irr,
> +	.hwapic_irr_update = svm_hwapic_irr_update,
> +	.hwapic_isr_update = svm_hwapic_isr_update,
>  
>  	.set_tss_addr = svm_set_tss_addr,
>  	.get_tdp_level = get_npt_level,
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Suthikulpanit, Suravee March 15, 2016, 5:09 p.m. UTC | #2
Hi

On 03/07/2016 11:41 PM, Paolo Bonzini wrote:
> On 04/03/2016 21:46, Suravee Suthikulpanit wrote:
> >  [....]
>> +/* Note: This structure is per VM */
>> +struct svm_vm_data {
>> +	atomic_t count;
>> +	u32 ldr_mode;
>> +	u32 avic_max_vcpu_id;
>> +	u32 avic_tag;
>> +
>> +	struct page *avic_log_ait_page;
>> +	struct page *avic_phy_ait_page;
>
> You can put these directly in kvm_arch.  Do not use abbreviations:
>
> 	struct page *avic_logical_apic_id_table_page;
> 	struct page *avic_physical_apic_id_table_page;
>

Actually, the reason I would like to introduce this per-arch specific 
structure is because I feel that it is easier to manage these 
processor-specific variable/data-structure. If we add all these directly 
into kvm_arch, which is shared b/w SVM and VMX, it is more difficult to 
tell which one is used in the different code base.

>> [...]
>> +	memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE);
>> +	svm->vcpu.arch.apic->regs = vapic_bkpg;
>
> Can you explain the flipping logic, and why you cannot just use the
> existing apic.regs?

Please see "explanation 1" below.

>> [...]
>> +static struct svm_avic_phy_ait_entry *
>> +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index)
>> +{
>> +    [.....]
>> +}
>> +
>> +struct svm_avic_log_ait_entry *
>> +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat)
>> +{
>> +	[.....]
>> +}
>
> Instead of these functions, create a complete function to handle APIC_ID
> and APIC_LDR writes.  Then use kmap/kunmap instead of page_address.
>

Ok. May I ask why we are against using page_address?  I have see that 
used in several places in the code.

>> [...]
>> +static int avic_alloc_bk_page(struct vcpu_svm *svm, int id)
>> +{
>> +	int ret = 0, i;
>> +	bool realloc = false;
>> +	struct kvm_vcpu *vcpu;
>> +	struct kvm *kvm = svm->vcpu.kvm;
>> +	struct svm_vm_data *vm_data = kvm->arch.arch_data;
>> +
>> +	mutex_lock(&kvm->slots_lock);
>> +
>> +	/* Check if we have already allocated vAPIC backing
>> +	 * page for this vCPU. If not, we need to realloc
>> +	 * a new one and re-assign all other vCPU.
>> +	 */
>> +	if (kvm->arch.apic_access_page_done &&
>> +	    (id > vm_data->avic_max_vcpu_id)) {
>> +		kvm_for_each_vcpu(i, vcpu, kvm)
>> +			avic_unalloc_bk_page(vcpu);
>> +
>> +		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
>> +					0, 0);
>> +		realloc = true;
>> +		vm_data->avic_max_vcpu_id = 0;
>> +	}
>> +
>> +	/*
>> +	 * We are allocating vAPIC backing page
>> +	 * upto the max vCPU ID
>> +	 */
>> +	if (id >= vm_data->avic_max_vcpu_id) {
>> +		ret = __x86_set_memory_region(kvm,
>> +					      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
>> +					      APIC_DEFAULT_PHYS_BASE,
>> +					      PAGE_SIZE * (id + 1));
>
> Why is this necessary?  The APIC access page is a peculiarity of Intel
> processors (and the special memslot for only needs to map 0xfee00000 to
> 0xfee00fff; after that there is the MSI area).
>

Please see "explanation 1" below.

 >> [...]
>> +		if (ret)
>> +			goto out;
>> +
>> +		vm_data->avic_max_vcpu_id = id;
>> +	}
>> +
>> +	/* Reinit vAPIC backing page for exisinting vcpus */
>> +	if (realloc)
>> +		kvm_for_each_vcpu(i, vcpu, kvm)
>> +			avic_init_bk_page(vcpu);
>
> Why is this necessary?

Explanation 1:

The current lapic regs page is allocated using get_zeroed_page(), which 
can be paged out. If I use these pages for AVIC backing pages, it seems 
to cause VM to slow down quite a bit due to a lot of page faults.

Currently, the AVIC backing pages are acquired from __x86_set_memory 
region() with APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, which maps the pages for 
address 0xfee00000 and above for VM to use. I mostly grab this from the 
VMX implementation in alloc_apic_access_page().

However, the memslot requires specification of the size at the time when 
calling __x86_set_memory_region(). However, I can't seem to figure out 
where I can get the number of vcpus at the time when we creating VM. 
Therefore, I have to track the vcpu creation, and re-acquire larger 
memslot every time vcpu_create() is called.

I was not sure if this is the right approach, any suggestion for this part.

Thanks,
Suravee
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini March 15, 2016, 5:22 p.m. UTC | #3
On 15/03/2016 18:09, Suravee Suthikulpanit wrote:
> Hi
> 
> On 03/07/2016 11:41 PM, Paolo Bonzini wrote:
>> On 04/03/2016 21:46, Suravee Suthikulpanit wrote:
>> >  [....]
>>> +/* Note: This structure is per VM */
>>> +struct svm_vm_data {
>>> +    atomic_t count;
>>> +    u32 ldr_mode;
>>> +    u32 avic_max_vcpu_id;
>>> +    u32 avic_tag;
>>> +
>>> +    struct page *avic_log_ait_page;
>>> +    struct page *avic_phy_ait_page;
>>
>> You can put these directly in kvm_arch.  Do not use abbreviations:
>>
>>     struct page *avic_logical_apic_id_table_page;
>>     struct page *avic_physical_apic_id_table_page;
>>
> 
> Actually, the reason I would like to introduce this per-arch specific
> structure is because I feel that it is easier to manage these
> processor-specific variable/data-structure. If we add all these directly
> into kvm_arch, which is shared b/w SVM and VMX, it is more difficult to
> tell which one is used in the different code base.

You're right, but adding a pointer makes things slower and larger.
Using an anonymous union would work.  For now, I prefer to have the
fields directly in kvm_arch.

>>> [...]
>>> +static struct svm_avic_phy_ait_entry *
>>> +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index)
>>> +{
>>> +    [.....]
>>> +}
>>> +
>>> +struct svm_avic_log_ait_entry *
>>> +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat)
>>> +{
>>> +    [.....]
>>> +}
>>
>> Instead of these functions, create a complete function to handle APIC_ID
>> and APIC_LDR writes.  Then use kmap/kunmap instead of page_address.
>>
> 
> Ok. May I ask why we are against using page_address?  I have see that
> used in several places in the code.

You're right, I guess page_address is okay for pages that were allocated
with alloc_page().

>> Why is this necessary?  The APIC access page is a peculiarity of Intel
>> processors (and the special memslot for only needs to map 0xfee00000 to
>> 0xfee00fff; after that there is the MSI area).
> 
> The current lapic regs page is allocated using get_zeroed_page(), which
> can be paged out. If I use these pages for AVIC backing pages, it seems
> to cause VM to slow down quite a bit due to a lot of page faults.

What causes the lapic regs page to be paged out?

> Currently, the AVIC backing pages are acquired from __x86_set_memory
> region() with APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, which maps the pages for
> address 0xfee00000 and above for VM to use. I mostly grab this from the
> VMX implementation in alloc_apic_access_page().
> 
> However, the memslot requires specification of the size at the time when
> calling __x86_set_memory_region(). However, I can't seem to figure out
> where I can get the number of vcpus at the time when we creating VM.
> Therefore, I have to track the vcpu creation, and re-acquire larger
> memslot every time vcpu_create() is called.

The purpose of the APIC_ACCESS_PAGE_PRIVATE_MEMSLOT is very specific:
it is there to provide a mapping for 0xfee00000 because Intel processors
trap writes between 0xfee00000 and 0xfee00fff, but otherwise ignore the
contents of the page you map there.  Intel processors only need
something to compare the physical address with, they don't care about
the data in the page, so they have one page per VM (they could really
use a single page in the whole system---one per VM is just how KVM works
right now).  It is a peculiar design, and one that you should probably
ignore in your AVIC patches.

The AVIC backing page is more similar to Intel's "virtual-APIC page".
You can see that vmx.c just uses lapic->regs for it.

        if (cpu_has_vmx_tpr_shadow() && !init_event) {
                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
                if (cpu_need_tpr_shadow(vcpu))
                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
                                     __pa(vcpu->arch.apic->regs));
                vmcs_write32(TPR_THRESHOLD, 0);
        }

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Suthikulpanit, Suravee March 16, 2016, 6:22 a.m. UTC | #4
Hi,

On 03/16/2016 12:22 AM, Paolo Bonzini wrote:
>>> Why is this necessary?  The APIC access page is a peculiarity of Intel
>>> >>processors (and the special memslot for only needs to map 0xfee00000 to
>>> >>0xfee00fff; after that there is the MSI area).
>> >
>> >The current lapic regs page is allocated using get_zeroed_page(), which
>> >can be paged out. If I use these pages for AVIC backing pages, it seems
>> >to cause VM to slow down quite a bit due to a lot of page faults.
> What causes the lapic regs page to be paged out?
>

This is mainly causing a large number of VMEXIT due to NPF.  In my test 
running hackbench in the guest. The following are perf result profiling 
for 10 seconds in the host in two cases:

CASE1: Using x86_set_memory_region() for AVIC backing page

# ./perf-vmexit.sh 10
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 2.813 MB perf.data.guest (30356 samples) ]


Analyze events for all VMs, all VCPUs:

              VM-EXIT    Samples  Samples%     Time%    Min Time    Max 
Time         Avg time

            interrupt      10042    66.30%    81.33%      0.43us 
202.50us      7.43us ( +-   1.20% )
                  msr       5004    33.04%    15.76%      0.73us 
12.21us      2.89us ( +-   0.43% )
                pause         58     0.38%     0.18%      0.56us 
5.88us      2.92us ( +-   6.43% )
                  npf         35     0.23%     2.01%      6.41us 
207.78us     52.70us ( +-  23.67% )
                  nmi          4     0.03%     0.02%      2.31us 
4.67us      3.49us ( +-  14.26% )
                   io          3     0.02%     0.70%     82.75us 
360.90us    214.28us ( +-  37.64% )
      avic_incomp_ipi          1     0.01%     0.00%      2.17us 
2.17us      2.17us ( +-   0.00% )

Total Samples:15147, Total events handled time:91715.78us.


CASE2: Using the lapic regs page for AVIC backing page.

# ./perf-vmexit.sh 10
[ perf record: Woken up 255 times to write data ]
[ perf record: Captured and wrote 509.202 MB perf.data.guest (5718856 
samples) ]


Analyze events for all VMs, all VCPUs:

              VM-EXIT    Samples  Samples%     Time%    Min Time    Max 
Time         Avg time

                  npf    1897710    99.33%    98.08%      1.09us 
243.22us      1.67us ( +-   0.04% )
            interrupt       7818     0.41%     1.44%      0.44us 
216.55us      5.97us ( +-   1.92% )
                  msr       5001     0.26%     0.45%      0.68us 
12.58us      2.89us ( +-   0.50% )
                pause         25     0.00%     0.00%      0.71us 
4.23us      2.03us ( +-  10.76% )
                   io          4     0.00%     0.03%     73.91us 
337.29us    206.74us ( +-  26.38% )
                  nmi          1     0.00%     0.00%      5.92us 
5.92us      5.92us ( +-   0.00% )

Total Samples:1910559, Total events handled time:3229214.64us.

Thanks,
Suravee
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini March 16, 2016, 7:20 a.m. UTC | #5
On 16/03/2016 07:22, Suravee Suthikulpanit wrote:
> This is mainly causing a large number of VMEXIT due to NPF.

Got it, it's here in the manual: "System software is responsible for
setting up a translation in the nested page table granting guest read
and write permissions for accesses to the vAPIC Backing Page in SPA
space. AVIC hardware walks the nested page table to check permissions,
but does not use the SPA address specified in the leaf page table entry.
Instead, AVIC hardware finds this address in the AVIC_BACKING_PAGE
pointer field of the VMCB".

Strictly speaking the address of the 0xFEE00000 translation is
unnecessary and it could be all zeroes, but I suggest that you set up an
APIC access page like Intel does (4k only), using the special memslot.
The AVIC backing page can then point to lapic->regs.

Thanks for the explanation!

Paolo

> CASE1: Using x86_set_memory_region() for AVIC backing page
> 
> # ./perf-vmexit.sh 10
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 2.813 MB perf.data.guest (30356
> samples) ]
> 
> 
> Analyze events for all VMs, all VCPUs:
> 
>              VM-EXIT    Samples  Samples%     Time%    Min Time    Max
> Time         Avg time
> 
>            interrupt      10042    66.30%    81.33%      0.43us
> 202.50us      7.43us ( +-   1.20% )
>                  msr       5004    33.04%    15.76%      0.73us
> 12.21us      2.89us ( +-   0.43% )
>                pause         58     0.38%     0.18%      0.56us
> 5.88us      2.92us ( +-   6.43% )
>                  npf         35     0.23%     2.01%      6.41us
> 207.78us     52.70us ( +-  23.67% )
>                  nmi          4     0.03%     0.02%      2.31us
> 4.67us      3.49us ( +-  14.26% )
>                   io          3     0.02%     0.70%     82.75us
> 360.90us    214.28us ( +-  37.64% )
>      avic_incomp_ipi          1     0.01%     0.00%      2.17us
> 2.17us      2.17us ( +-   0.00% )
> 
> Total Samples:15147, Total events handled time:91715.78us.
> 
> 
> CASE2: Using the lapic regs page for AVIC backing page.
> 
> # ./perf-vmexit.sh 10
> [ perf record: Woken up 255 times to write data ]
> [ perf record: Captured and wrote 509.202 MB perf.data.guest (5718856
> samples) ]
> 
> 
> Analyze events for all VMs, all VCPUs:
> 
>              VM-EXIT    Samples  Samples%     Time%    Min Time    Max
> Time         Avg time
> 
>                  npf    1897710    99.33%    98.08%      1.09us
> 243.22us      1.67us ( +-   0.04% )
>            interrupt       7818     0.41%     1.44%      0.44us
> 216.55us      5.97us ( +-   1.92% )
>                  msr       5001     0.26%     0.45%      0.68us
> 12.58us      2.89us ( +-   0.50% )
>                pause         25     0.00%     0.00%      0.71us
> 4.23us      2.03us ( +-  10.76% )
>                   io          4     0.00%     0.03%     73.91us
> 337.29us    206.74us ( +-  26.38% )
>                  nmi          1     0.00%     0.00%      5.92us
> 5.92us      5.92us ( +-   0.00% )
> 
> Total Samples:1910559, Total events handled time:3229214.64us.
> 
> Thanks,
> Suravee
> -- 
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Suthikulpanit, Suravee March 16, 2016, 8:21 a.m. UTC | #6
Hi,

On 03/16/2016 02:20 PM, Paolo Bonzini wrote:
>
> On 16/03/2016 07:22, Suravee Suthikulpanit wrote:
>> >This is mainly causing a large number of VMEXIT due to NPF.
> Got it, it's here in the manual: "System software is responsible for
> setting up a translation in the nested page table granting guest read
> and write permissions for accesses to the vAPIC Backing Page in SPA
> space. AVIC hardware walks the nested page table to check permissions,
> but does not use the SPA address specified in the leaf page table entry.
> Instead, AVIC hardware finds this address in the AVIC_BACKING_PAGE
> pointer field of the VMCB".
>
> Strictly speaking the address of the 0xFEE00000 translation is
> unnecessary and it could be all zeroes, but I suggest that you set up an
> APIC access page like Intel does (4k only), using the special memslot.
> The AVIC backing page can then point to lapic->regs.
>
> Thanks for the explanation!
>
> Paolo
>

Ahh... you are right, this works also. Thanks for the pointer. I'm 
fixing this, doing some more testing, and cleaning up the code. This has 
simplify the init logic quite a bit.

Thanks for suggestion,
Suravee
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini March 16, 2016, 11:12 a.m. UTC | #7
On 16/03/2016 09:21, Suravee Suthikulpanit wrote:
>> Strictly speaking the address of the 0xFEE00000 translation is
>> unnecessary and it could be all zeroes, but I suggest that you set up an
>> APIC access page like Intel does (4k only), using the special memslot.
>> The AVIC backing page can then point to lapic->regs.
> 
> Ahh... you are right, this works also. Thanks for the pointer. I'm
> fixing this, doing some more testing, and cleaning up the code. This has
> simplify the init logic quite a bit.

Awesome, thanks.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9a61669..15f6cd8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -754,6 +754,8 @@  struct kvm_arch {
 
 	bool irqchip_split;
 	u8 nr_reserved_ioapic_pins;
+
+	void *arch_data;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6ab66d0..c703149 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -14,6 +14,9 @@ 
  * the COPYING file in the top-level directory.
  *
  */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
 #include <linux/kvm_host.h>
 
 #include "irq.h"
@@ -78,6 +81,11 @@  MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define TSC_RATIO_MIN		0x0000000000000001ULL
 #define TSC_RATIO_MAX		0x000000ffffffffffULL
 
+#define AVIC_HPA_MASK	~((0xFFFULL << 52) || 0xFFF)
+
+/* NOTE: Current max index allowed for physical APIC ID table is 255 */
+#define AVIC_PHY_APIC_ID_MAX	0xFF
+
 static bool erratum_383_found __read_mostly;
 
 static const u32 host_save_user_msrs[] = {
@@ -162,6 +170,37 @@  struct vcpu_svm {
 
 	/* cached guest cpuid flags for faster access */
 	bool nrips_enabled	: 1;
+
+	struct page *avic_bk_page;
+	void *in_kernel_lapic_regs;
+};
+
+struct __attribute__ ((__packed__))
+svm_avic_log_ait_entry {
+	u32 guest_phy_apic_id	: 8,
+	    res			: 23,
+	    valid		: 1;
+};
+
+struct __attribute__ ((__packed__))
+svm_avic_phy_ait_entry {
+	u64 host_phy_apic_id	: 8,
+	    res1		: 4,
+	    bk_pg_ptr		: 40,
+	    res2		: 10,
+	    is_running		: 1,
+	    valid		: 1;
+};
+
+/* Note: This structure is per VM */
+struct svm_vm_data {
+	atomic_t count;
+	u32 ldr_mode;
+	u32 avic_max_vcpu_id;
+	u32 avic_tag;
+
+	struct page *avic_log_ait_page;
+	struct page *avic_phy_ait_page;
 };
 
 static DEFINE_PER_CPU(u64, current_tsc_ratio);
@@ -205,6 +244,10 @@  module_param(npt, int, S_IRUGO);
 static int nested = true;
 module_param(nested, int, S_IRUGO);
 
+/* enable / disable AVIC */
+static int avic;
+module_param(avic, int, S_IRUGO);
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -234,6 +277,13 @@  enum {
 /* TPR and CR2 are always written before VMRUN */
 #define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2))
 
+#define VMCB_AVIC_APIC_BAR_MASK		0xFFFFFFFFFF000ULL
+
+static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
+{
+	svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
+}
+
 static inline void mark_all_dirty(struct vmcb *vmcb)
 {
 	vmcb->control.clean = 0;
@@ -923,6 +973,13 @@  static __init int svm_hardware_setup(void)
 	} else
 		kvm_disable_tdp();
 
+	if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)))
+		avic = false;
+
+	if (avic) {
+		printk(KERN_INFO "kvm: AVIC enabled\n");
+	}
+
 	return 0;
 
 err:
@@ -1000,6 +1057,41 @@  static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
+static void avic_init_vmcb(struct vcpu_svm *svm)
+{
+	void *vapic_bkpg;
+	struct vmcb *vmcb = svm->vmcb;
+	struct svm_vm_data *vm_data = svm->vcpu.kvm->arch.arch_data;
+	phys_addr_t bpa = PFN_PHYS(page_to_pfn(svm->avic_bk_page));
+	phys_addr_t lpa = PFN_PHYS(page_to_pfn(vm_data->avic_log_ait_page));
+	phys_addr_t ppa = PFN_PHYS(page_to_pfn(vm_data->avic_phy_ait_page));
+
+	if (!vmcb)
+		return;
+
+	pr_debug("%s: bpa=%#llx, lpa=%#llx, ppa=%#llx\n",
+		 __func__, (unsigned long long)bpa,
+		 (unsigned long long)lpa, (unsigned long long)ppa);
+
+
+	/*
+	 * Here we copy the value from the emulated LAPIC register
+	 * to the vAPIC backign page, and then flip regs frame.
+	 */
+	if (!svm->in_kernel_lapic_regs)
+		svm->in_kernel_lapic_regs = svm->vcpu.arch.apic->regs;
+
+	vapic_bkpg = pfn_to_kaddr(page_to_pfn(svm->avic_bk_page));
+	memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE);
+	svm->vcpu.arch.apic->regs = vapic_bkpg;
+
+	vmcb->control.avic_bk_page = bpa & AVIC_HPA_MASK;
+	vmcb->control.avic_log_apic_id = lpa & AVIC_HPA_MASK;
+	vmcb->control.avic_phy_apic_id = ppa & AVIC_HPA_MASK;
+	vmcb->control.avic_phy_apic_id |= AVIC_PHY_APIC_ID_MAX;
+	vmcb->control.avic_enable = 1;
+}
+
 static void init_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -1113,6 +1205,293 @@  static void init_vmcb(struct vcpu_svm *svm)
 	mark_all_dirty(svm->vmcb);
 
 	enable_gif(svm);
+
+	if (avic)
+		avic_init_vmcb(svm);
+}
+
+static struct svm_avic_phy_ait_entry *
+avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index)
+{
+	struct svm_avic_phy_ait_entry *avic_phy_ait;
+	struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
+
+	if (!vm_data)
+		return NULL;
+
+	/* Note: APIC ID = 0xff is used for broadcast.
+	 *       APIC ID > 0xff is reserved.
+	 */
+	if (index >= 0xff)
+		return NULL;
+
+	avic_phy_ait = page_address(vm_data->avic_phy_ait_page);
+
+	return &avic_phy_ait[index];
+}
+
+struct svm_avic_log_ait_entry *
+avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat)
+{
+	struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
+	int index;
+	struct svm_avic_log_ait_entry *avic_log_ait;
+
+	if (!vm_data)
+		return NULL;
+
+	if (is_flat) { /* flat */
+		if (mda > 7)
+			return NULL;
+		index = mda;
+	} else { /* cluster */
+		int apic_id = mda & 0xf;
+		int cluster_id = (mda & 0xf0) >> 8;
+
+		if (apic_id > 4 || cluster_id >= 0xf)
+			return NULL;
+		index = (cluster_id << 2) + apic_id;
+	}
+	avic_log_ait = (struct svm_avic_log_ait_entry *)
+				page_address(vm_data->avic_log_ait_page);
+
+	return &avic_log_ait[index];
+}
+
+static inline void
+avic_set_bk_page_entry(struct vcpu_svm *svm, int reg_off, u32 val)
+{
+	void *avic_bk = page_address(svm->avic_bk_page);
+
+	*((u32 *) (avic_bk + reg_off)) = val;
+}
+
+static inline u32 *avic_get_bk_page_entry(struct vcpu_svm *svm, u32 offset)
+{
+	char *tmp = (char *)page_address(svm->avic_bk_page);
+
+	return (u32 *)(tmp+offset);
+}
+
+static int avic_init_log_apic_entry(struct kvm_vcpu *vcpu, u8 g_phy_apic_id,
+				    u8 log_apic_id)
+{
+	u32 mod;
+	struct svm_avic_log_ait_entry *entry;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (!svm)
+		return -EINVAL;
+
+	mod = (*avic_get_bk_page_entry(svm, APIC_DFR) >> 28) & 0xf;
+	entry = avic_get_log_ait_entry(vcpu, log_apic_id, (mod == 0xf));
+	if (!entry)
+		return -EINVAL;
+	entry->guest_phy_apic_id = g_phy_apic_id;
+	entry->valid = 1;
+
+	return 0;
+}
+
+static int avic_init_bk_page(struct kvm_vcpu *vcpu)
+{
+	u64 addr;
+	struct page *page;
+	int id = vcpu->vcpu_id;
+	struct kvm *kvm = vcpu->kvm;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	addr = APIC_DEFAULT_PHYS_BASE + (id * PAGE_SIZE);
+	page = gfn_to_page(kvm, addr >> PAGE_SHIFT);
+	if (is_error_page(page))
+		return -EFAULT;
+
+	/*
+	 * Do not pin the page in memory, so that memory hot-unplug
+	 * is able to migrate it.
+	 */
+	put_page(page);
+
+	/* Setting up AVIC Backing Page */
+	svm->avic_bk_page = page;
+
+	clear_page(kmap(page));
+	pr_debug("%s: vAPIC bk page: cpu=%u, addr=%#llx, pa=%#llx\n",
+		 __func__, id, addr,
+		 (unsigned long long) PFN_PHYS(page_to_pfn(page)));
+
+	avic_init_vmcb(svm);
+
+	return 0;
+}
+
+static inline void avic_unalloc_bk_page(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (svm->avic_bk_page)
+		kunmap(svm->avic_bk_page);
+}
+
+static int avic_alloc_bk_page(struct vcpu_svm *svm, int id)
+{
+	int ret = 0, i;
+	bool realloc = false;
+	struct kvm_vcpu *vcpu;
+	struct kvm *kvm = svm->vcpu.kvm;
+	struct svm_vm_data *vm_data = kvm->arch.arch_data;
+
+	mutex_lock(&kvm->slots_lock);
+
+	/* Check if we have already allocated vAPIC backing
+	 * page for this vCPU. If not, we need to realloc
+	 * a new one and re-assign all other vCPU.
+	 */
+	if (kvm->arch.apic_access_page_done &&
+	    (id > vm_data->avic_max_vcpu_id)) {
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			avic_unalloc_bk_page(vcpu);
+
+		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+					0, 0);
+		realloc = true;
+		vm_data->avic_max_vcpu_id = 0;
+	}
+
+	/*
+	 * We are allocating vAPIC backing page
+	 * upto the max vCPU ID
+	 */
+	if (id >= vm_data->avic_max_vcpu_id) {
+		ret = __x86_set_memory_region(kvm,
+					      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+					      APIC_DEFAULT_PHYS_BASE,
+					      PAGE_SIZE * (id + 1));
+		if (ret)
+			goto out;
+
+		vm_data->avic_max_vcpu_id = id;
+	}
+
+	/* Reinit vAPIC backing page for exisinting vcpus */
+	if (realloc)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			avic_init_bk_page(vcpu);
+
+	avic_init_bk_page(&svm->vcpu);
+
+	kvm->arch.apic_access_page_done = true;
+
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+static void avic_vm_uninit(struct kvm *kvm)
+{
+	struct svm_vm_data *vm_data = kvm->arch.arch_data;
+
+	if (!vm_data)
+		return;
+
+	if (vm_data->avic_log_ait_page)
+		__free_page(vm_data->avic_log_ait_page);
+	if (vm_data->avic_phy_ait_page)
+		__free_page(vm_data->avic_phy_ait_page);
+	kfree(vm_data);
+	kvm->arch.arch_data = NULL;
+}
+
+static void avic_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data;
+
+	if (!avic)
+		return;
+
+	avic_unalloc_bk_page(vcpu);
+	svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs;
+	svm->in_kernel_lapic_regs = NULL;
+
+	if (vm_data &&
+	    (atomic_read(&vm_data->count) == 0 ||
+	     atomic_dec_and_test(&vm_data->count)))
+		avic_vm_uninit(vcpu->kvm);
+}
+
+static atomic_t avic_tag_gen = ATOMIC_INIT(1);
+
+static inline u32 avic_get_next_tag(void)
+{
+	u32 tag = atomic_read(&avic_tag_gen);
+
+	atomic_inc(&avic_tag_gen);
+	return tag;
+}
+
+static int avic_vm_init(struct kvm *kvm)
+{
+	int err = -ENOMEM;
+	struct svm_vm_data *vm_data;
+	struct page *avic_phy_ait_page;
+	struct page *avic_log_ait_page;
+
+	vm_data = kzalloc(sizeof(struct svm_vm_data),
+				      GFP_KERNEL);
+	if (!vm_data)
+		return err;
+
+	kvm->arch.arch_data = vm_data;
+	atomic_set(&vm_data->count, 0);
+
+	/* Allocating physical APIC ID table (4KB) */
+	avic_phy_ait_page = alloc_page(GFP_KERNEL);
+	if (!avic_phy_ait_page)
+		goto free_avic;
+
+	vm_data->avic_phy_ait_page = avic_phy_ait_page;
+	clear_page(page_address(avic_phy_ait_page));
+
+	/* Allocating logical APIC ID table (4KB) */
+	avic_log_ait_page = alloc_page(GFP_KERNEL);
+	if (!avic_log_ait_page)
+		goto free_avic;
+
+	vm_data->avic_log_ait_page = avic_log_ait_page;
+	clear_page(page_address(avic_log_ait_page));
+
+	vm_data->avic_tag = avic_get_next_tag();
+
+	return 0;
+
+free_avic:
+	avic_vm_uninit(kvm);
+	return err;
+}
+
+static int avic_vcpu_init(struct kvm *kvm, struct vcpu_svm *svm, int id)
+{
+	int err;
+	struct svm_vm_data *vm_data = NULL;
+
+	/* Note: svm_vm_data is per VM */
+	if (!kvm->arch.arch_data) {
+		err = avic_vm_init(kvm);
+		if (err)
+			return err;
+	}
+
+	err = avic_alloc_bk_page(svm, id);
+	if (err) {
+		avic_vcpu_uninit(&svm->vcpu);
+		return err;
+	}
+
+	vm_data = kvm->arch.arch_data;
+	atomic_inc(&vm_data->count);
+
+	return 0;
 }
 
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -1131,6 +1510,9 @@  static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
+
+	if (avic && !init_event)
+		avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -1169,6 +1551,12 @@  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!hsave_page)
 		goto free_page3;
 
+	if (avic) {
+		err = avic_vcpu_init(kvm, svm, id);
+		if (err)
+			goto free_page4;
+	}
+
 	svm->nested.hsave = page_address(hsave_page);
 
 	svm->msrpm = page_address(msrpm_pages);
@@ -1187,6 +1575,8 @@  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	return &svm->vcpu;
 
+free_page4:
+	__free_page(hsave_page);
 free_page3:
 	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
 free_page2:
@@ -1209,6 +1599,7 @@  static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
 	__free_page(virt_to_page(svm->nested.hsave));
 	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
+	avic_vcpu_uninit(vcpu);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, svm);
 }
@@ -3382,6 +3773,7 @@  static void dump_vmcb(struct kvm_vcpu *vcpu)
 	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
 	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
 	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
 	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
 	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
 	pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
@@ -3613,11 +4005,31 @@  static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 
 static bool svm_get_enable_apicv(void)
 {
-	return false;
+	return avic;
+}
+
+static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
 }
 
+static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+}
+
+/* Note: Currently only used by Hyper-V. */
 static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *vmcb = svm->vmcb;
+
+	if (!avic)
+		return;
+
+	if (!svm->in_kernel_lapic_regs)
+		return;
+
+	svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs;
+	vmcb->control.avic_enable = 0;
 }
 
 static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -4387,6 +4799,8 @@  static struct kvm_x86_ops svm_x86_ops = {
 	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.sync_pir_to_irr = svm_sync_pir_to_irr,
+	.hwapic_irr_update = svm_hwapic_irr_update,
+	.hwapic_isr_update = svm_hwapic_isr_update,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,