
[Part2,v5,21/45] KVM: SVM: Make AVIC backing, VMSA and VMCB memory allocation SNP safe

Message ID 20210820155918.7518-22-brijesh.singh@amd.com (mailing list archive)
State New, archived
Series Add AMD Secure Nested Paging (SEV-SNP) Hypervisor Support

Commit Message

Brijesh Singh Aug. 20, 2021, 3:58 p.m. UTC
Implement a workaround for an SNP erratum where the CPU will incorrectly
signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the
RMP entry of a VMCB, VMSA or AVIC backing page.

When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC
backing pages as "in-use" in the RMP after a successful VMRUN.  This is
done for _all_ VMs, not just SNP-Active VMs.

If the hypervisor accesses an in-use page through a writable translation,
the CPU will throw an RMP violation #PF.  On early SNP hardware, if an
in-use page is 2MB aligned and software accesses any part of the associated
2MB region via a hugepage mapping, the CPU will incorrectly treat the entire
2MB region as in-use and signal a spurious RMP violation #PF.

The recommended workaround is to not use a hugepage for the VMCB, VMSA or
AVIC backing page. Add a generic allocator that ensures the returned page
is not hugepage-aligned (2MB or 1GB) and is safe to use when SEV-SNP is
enabled. The allocator grabs an order-1 (two page) block, which is only
8KB aligned, so at most the first of its two pages can sit on a 2MB
boundary; it keeps the safe page and frees the other.

Co-developed-by: Marc Orr <marcorr@google.com>
Signed-off-by: Marc Orr <marcorr@google.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  1 +
 arch/x86/kvm/lapic.c               |  5 ++++-
 arch/x86/kvm/svm/sev.c             | 35 ++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c             | 16 ++++++++++++--
 arch/x86/kvm/svm/svm.h             |  1 +
 6 files changed, 56 insertions(+), 3 deletions(-)
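
To make the page-selection logic in the patch below concrete, here is a
stand-alone model of it (illustrative user-space C, not part of the patch;
pfn_is_2mb_aligned() stands in for the kernel's
IS_ALIGNED(__pfn_to_phys(pfn), PMD_SIZE) check):

#include <stdbool.h>
#include <stdio.h>

#define PAGES_PER_2MB	512UL	/* 2MB / 4KB */

static bool pfn_is_2mb_aligned(unsigned long pfn)
{
	return (pfn % PAGES_PER_2MB) == 0;
}

/*
 * Given the first pfn of an order-1 (two page) block, pick the pfn to
 * keep. The block is two-page aligned, so the first pfn is even, and
 * only it can ever sit on a 2MB boundary; the odd pfn+1 never can.
 */
static unsigned long pick_safe_pfn(unsigned long first_pfn)
{
	return pfn_is_2mb_aligned(first_pfn) ? first_pfn + 1 : first_pfn;
}

int main(void)
{
	/* 0x200 * 4KB = 0x200000, a 2MB boundary: keep the second page. */
	printf("0x%lx\n", pick_safe_pfn(0x200));	/* prints 0x201 */
	/* 0x1fe * 4KB = 0x1fe000, not a boundary: keep the first page. */
	printf("0x%lx\n", pick_safe_pfn(0x1fe));	/* prints 0x1fe */
	return 0;
}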

Comments

Dr. David Alan Gilbert Sept. 22, 2021, 6:55 p.m. UTC | #1
* Brijesh Singh (brijesh.singh@amd.com) wrote:
> Implement a workaround for an SNP erratum where the CPU will incorrectly
> signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the
> RMP entry of a VMCB, VMSA or AVIC backing page.
> 
> When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC
> backing pages as "in-use" in the RMP after a successful VMRUN.  This is
> done for _all_ VMs, not just SNP-Active VMs.

Can you explain what 'globally enabled' means?

Or more specifically, can we trip this bug on public hardware that has
SNP enabled in the BIOS, but no SNP init in the host OS?

Dave

Brijesh Singh Sept. 23, 2021, 6:09 p.m. UTC | #2
On 9/22/21 1:55 PM, Dr. David Alan Gilbert wrote:
> * Brijesh Singh (brijesh.singh@amd.com) wrote:
>> Implement a workaround for an SNP erratum where the CPU will incorrectly
>> signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the
>> RMP entry of a VMCB, VMSA or AVIC backing page.
>>
>> When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC
>> backing pages as "in-use" in the RMP after a successful VMRUN.  This is
>> done for _all_ VMs, not just SNP-Active VMs.
> Can you explain what 'globally enabled' means?

This means that SNP is enabled on the host, i.e. SYSCFG_MSR.Snp=1. Once it
is enabled, RMP checks are enforced.


> Or more specifically, can we trip this bug on public hardware that has
> the SNP enabled in the bios, but no SNP init in the host OS?

Enabling SNP support on the host is a three-step process:

step 1 (BIOS): reserve memory for the RMP table.

step 2 (host): initialize the RMP table memory and set the SYSCFG MSR to
enable the SNP feature.

step 3 (host): call SNP_INIT to initialize the SNP firmware (this is
needed only if you ever plan to launch an SNP guest from this host).

"SNP globally enabled" means steps 1 and 2 are done. The RMP checks are
enforced as soon as step 2 is completed.
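
To illustrate, the "globally enabled" state left behind by step 2 could in
principle be probed by reading the SYSCFG MSR. A minimal sketch, assuming
MSR_AMD64_SYSCFG (0xc0010010, formerly MSR_K8_SYSCFG) as the MSR name and
assuming bit 24 as the SNP-enable bit per the AMD APM:

#include <linux/bits.h>
#include <linux/types.h>
#include <asm/msr.h>

#define SYSCFG_SNP_EN	BIT_ULL(24)	/* assumed bit position, per the APM */

static bool snp_is_globally_enabled(void)
{
	u64 syscfg;

	/* Reads the SYSCFG MSR; RMP checks are active once Snp=1. */
	rdmsrl(MSR_AMD64_SYSCFG, syscfg);
	return !!(syscfg & SYSCFG_SNP_EN);
}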

thanks

Dr. David Alan Gilbert Sept. 23, 2021, 6:39 p.m. UTC | #3
* Brijesh Singh (brijesh.singh@amd.com) wrote:
> 
> On 9/22/21 1:55 PM, Dr. David Alan Gilbert wrote:
> > * Brijesh Singh (brijesh.singh@amd.com) wrote:
> >> Implement a workaround for an SNP erratum where the CPU will incorrectly
> >> signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the
> >> RMP entry of a VMCB, VMSA or AVIC backing page.
> >>
> >> When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC
> >> backing pages as "in-use" in the RMP after a successful VMRUN.  This is
> >> done for _all_ VMs, not just SNP-Active VMs.
> > Can you explain what 'globally enabled' means?
> 
> This means that SNP is enabled on the host, i.e. SYSCFG_MSR.Snp=1. Once it
> is enabled, RMP checks are enforced.
> 
> 
> > Or more specifically, can we trip this bug on public hardware that has
> > the SNP enabled in the bios, but no SNP init in the host OS?
> 
> Enabling SNP support on the host is a three-step process:
>
> step 1 (BIOS): reserve memory for the RMP table.
>
> step 2 (host): initialize the RMP table memory and set the SYSCFG MSR to
> enable the SNP feature.
>
> step 3 (host): call SNP_INIT to initialize the SNP firmware (this is
> needed only if you ever plan to launch an SNP guest from this host).
>
> "SNP globally enabled" means steps 1 and 2 are done. The RMP checks are
> enforced as soon as step 2 is completed.

So I think that means we don't need to backport this to older kernels
that don't know about SNP but might run on SNP-enabled hardware, since
those kernels won't do step 2.
Dave

Marc Orr Sept. 23, 2021, 7:17 p.m. UTC | #4
On Thu, Sep 23, 2021 at 11:09 AM Brijesh Singh <brijesh.singh@amd.com> wrote:
>
>
> On 9/22/21 1:55 PM, Dr. David Alan Gilbert wrote:
> > * Brijesh Singh (brijesh.singh@amd.com) wrote:
> >> Implement a workaround for an SNP erratum where the CPU will incorrectly
> >> signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the
> >> RMP entry of a VMCB, VMSA or AVIC backing page.
> >>
> >> When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC
> >> backing pages as "in-use" in the RMP after a successful VMRUN.  This is
> >> done for _all_ VMs, not just SNP-Active VMs.
> > Can you explain what 'globally enabled' means?
>
> This means that SNP is enabled on the host, i.e. SYSCFG_MSR.Snp=1. Once it
> is enabled, RMP checks are enforced.
>
>
> > Or more specifically, can we trip this bug on public hardware that has
> > the SNP enabled in the bios, but no SNP init in the host OS?
>
> Enabling SNP support on the host is a three-step process:
>
> step 1 (BIOS): reserve memory for the RMP table.
>
> step 2 (host): initialize the RMP table memory and set the SYSCFG MSR to
> enable the SNP feature.
>
> step 3 (host): call SNP_INIT to initialize the SNP firmware (this is
> needed only if you ever plan to launch an SNP guest from this host).
>
> "SNP globally enabled" means steps 1 and 2 are done. The RMP checks are
> enforced as soon as step 2 is completed.

It might be good to update the code to reflect this reply, such that
the new `snp_safe_alloc_page()` function introduced in this patch only
deviates from "normal" page allocation logic when these two conditions
are met. We could introduce a global variable, `snp_globally_enabled`,
that defaults to false and gets set to true after we write the SYSCFG
MSR.
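
A sketch of that suggestion (hypothetical flag name, not code from this
series), keeping the rest of the allocator from the patch unchanged:

/* Hypothetical flag, set by host bring-up code right after the
 * successful SYSCFG MSR write in "step 2" from the discussion above. */
static bool snp_globally_enabled __read_mostly;

struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
{
	unsigned long pfn;
	struct page *p;

	/* was: if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) */
	if (!snp_globally_enabled)
		return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);

	/* Unchanged from the patch: order-1 alloc, split, keep the page
	 * that is not 2MB aligned and free the other. */
	p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
	if (!p)
		return NULL;

	split_page(p, 1);

	pfn = page_to_pfn(p);
	if (IS_ALIGNED(__pfn_to_phys(pfn), PMD_SIZE)) {
		pfn++;
		__free_page(p);
	} else {
		__free_page(pfn_to_page(pfn + 1));
	}

	return pfn_to_page(pfn);
}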

> >> +
> >> +struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
> >> +{
> >> +    unsigned long pfn;
> >> +    struct page *p;
> >> +
> >> +    if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
> >> +            return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);

Continuing my other comment, above: if we introduce a
`snp_globally_enabled` var, we could use that here, rather than
`cpu_feature_enabled(X86_FEATURE_SEV_SNP)`.

Brijesh Singh Sept. 23, 2021, 8:44 p.m. UTC | #5
On 9/23/21 2:17 PM, Marc Orr wrote:

>>>> +
>>>> +struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +    unsigned long pfn;
>>>> +    struct page *p;
>>>> +
>>>> +    if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
>>>> +            return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> Continuing my other comment, above: if we introduce a
> `snp_globally_enabled` var, we could use that here, rather than
> `cpu_feature_enabled(X86_FEATURE_SEV_SNP)`.


Maybe I am missing something, but what is wrong with the
cpu_feature_enabled(...) check? It's the same as creating a global
variable: the feature bit is not set if SNP is not enabled. See
patch #3 [1] in this series.

[1]
https://lore.kernel.org/linux-mm/YUN+L0dlFMbC3bd4@zn.tnic/T/#m2ac1242b33abfcd0d9fb22a89f4c103eacf67ea7
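
For readers following along, the gist of how patch #3 keys that feature
bit, paraphrased with a hypothetical helper name (see the link above for
the real code):

static int __init snp_rmptable_init(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
		return 0;

	if (!snp_setup_rmp_table())	/* hypothetical helper for RMP init */
		goto nosnp;

	/* ... set SYSCFG.Snp, i.e. "SNP globally enabled" ... */

	return 0;

nosnp:
	/* Clear the synthetic cap so cpu_feature_enabled() is false
	 * everywhere else, including in snp_safe_alloc_page(). */
	setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
	return -ENOSYS;
}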

thanks
Marc Orr Sept. 23, 2021, 8:55 p.m. UTC | #6
On Thu, Sep 23, 2021 at 1:44 PM Brijesh Singh <brijesh.singh@amd.com> wrote:
>
>
> On 9/23/21 2:17 PM, Marc Orr wrote:
>
> >>>> +
> >>>> +struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
> >>>> +{
> >>>> +    unsigned long pfn;
> >>>> +    struct page *p;
> >>>> +
> >>>> +    if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
> >>>> +            return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> > Continuing my other comment, above: if we introduce a
> > `snp_globally_enabled` var, we could use that here, rather than
> > `cpu_feature_enabled(X86_FEATURE_SEV_SNP)`.
>
>
> Maybe I am missing something, but what is wrong with the
> cpu_feature_enabled(...) check? It's the same as creating a global
> variable: the feature bit is not set if SNP is not enabled. See
> patch #3 [1] in this series.
>
> [1]
> https://lore.kernel.org/linux-mm/YUN+L0dlFMbC3bd4@zn.tnic/T/#m2ac1242b33abfcd0d9fb22a89f4c103eacf67ea7
>
> thanks

You are right. Patch #3 does exactly what I was asking for in
`snp_rmptable_init()`. Thanks!
Brijesh Singh Sept. 23, 2021, 10:23 p.m. UTC | #7
On 9/23/21 1:39 PM, Dr. David Alan Gilbert wrote:
> * Brijesh Singh (brijesh.singh@amd.com) wrote:
>> On 9/22/21 1:55 PM, Dr. David Alan Gilbert wrote:
>>> * Brijesh Singh (brijesh.singh@amd.com) wrote:
>>>> Implement a workaround for an SNP erratum where the CPU will incorrectly
>>>> signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the
>>>> RMP entry of a VMCB, VMSA or AVIC backing page.
>>>>
>>>> When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC
>>>> backing pages as "in-use" in the RMP after a successful VMRUN.  This is
>>>> done for _all_ VMs, not just SNP-Active VMs.
>>> Can you explain what 'globally enabled' means?
>> This means that SNP is enabled on the host, i.e. SYSCFG_MSR.Snp=1. Once it
>> is enabled, RMP checks are enforced.
>>
>>
>>> Or more specifically, can we trip this bug on public hardware that has
>>> the SNP enabled in the bios, but no SNP init in the host OS?
>> Enabling SNP support on the host is a three-step process:
>>
>> step 1 (BIOS): reserve memory for the RMP table.
>>
>> step 2 (host): initialize the RMP table memory and set the SYSCFG MSR to
>> enable the SNP feature.
>>
>> step 3 (host): call SNP_INIT to initialize the SNP firmware (this is
>> needed only if you ever plan to launch an SNP guest from this host).
>>
>> "SNP globally enabled" means steps 1 and 2 are done. The RMP checks are
>> enforced as soon as step 2 is completed.
> So I think that means we don't need to backport this to older kernels
> that don't know about SNP but might run on SNP-enabled hardware, since
> those kernels won't do step 2.

Correct.

thanks
Sean Christopherson Oct. 12, 2021, 8:44 p.m. UTC | #8
On Fri, Aug 20, 2021, Brijesh Singh wrote:
> Implement a workaround for an SNP erratum where the CPU will incorrectly
> signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the
> RMP entry of a VMCB, VMSA or AVIC backing page.

...

> @@ -4539,6 +4539,16 @@ static int svm_vm_init(struct kvm *kvm)
>  	return 0;
>  }
>  
> +static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
> +{
> +	struct page *page = snp_safe_alloc_page(vcpu);
> +
> +	if (!page)
> +		return NULL;
> +
> +	return page_address(page);
> +}
> +
>  static struct kvm_x86_ops svm_x86_ops __initdata = {
>  	.hardware_unsetup = svm_hardware_teardown,
>  	.hardware_enable = svm_hardware_enable,
> @@ -4667,6 +4677,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>  	.complete_emulated_msr = svm_complete_emulated_msr,
>  
>  	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
> +
> +	.alloc_apic_backing_page = svm_alloc_apic_backing_page,

IMO, this should be guarded by a module param or X86_BUG_* to make it clear that
this is a bug and not working as intended.

And doesn't the APIC page need these shenanigans iff AVIC is enabled? (the module
param, not necessarily in the VM)
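
A sketch of that suggestion with a hypothetical bug-flag name (the flag
and the hook point are both assumptions, not from this series); the op
would also have to become KVM_X86_OP_NULL so leaving it unset is legal:

/* In svm_hardware_setup(): install the workaround op only on affected
 * parts, so fixed hardware falls back to get_zeroed_page() in lapic.c. */
if (!boot_cpu_has_bug(X86_BUG_SNP_HUGEPAGE_RMP))	/* hypothetical flag */
	svm_x86_ops.alloc_apic_backing_page = NULL;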


Patch

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index a12a4987154e..36a9c23a4b27 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -122,6 +122,7 @@  KVM_X86_OP_NULL(enable_direct_tlbflush)
 KVM_X86_OP_NULL(migrate_timers)
 KVM_X86_OP(msr_filter_changed)
 KVM_X86_OP_NULL(complete_emulated_msr)
+KVM_X86_OP(alloc_apic_backing_page)
 
 #undef KVM_X86_OP
 #undef KVM_X86_OP_NULL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 974cbfb1eefe..5ad6255ff5d5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1453,6 +1453,7 @@  struct kvm_x86_ops {
 	int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
 
 	void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+	void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ba5a27879f1d..05b45747b20b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2457,7 +2457,10 @@  int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
 
 	vcpu->arch.apic = apic;
 
-	apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+	if (kvm_x86_ops.alloc_apic_backing_page)
+		apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu);
+	else
+		apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 	if (!apic->regs) {
 		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
 		       vcpu->vcpu_id);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 1644da5fc93f..8771b878193f 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2703,3 +2703,38 @@  void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 		break;
 	}
 }
+
+struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
+{
+	unsigned long pfn;
+	struct page *p;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+		return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+
+	/*
+	 * Allocate an SNP-safe page to work around the SNP erratum where
+	 * the CPU will incorrectly signal an RMP violation #PF if a
+	 * hugepage (2MB or 1GB) collides with the RMP entry of the VMCB,
+	 * VMSA or AVIC backing page. The recommended workaround is to not
+	 * use a hugepage.
+	 *
+	 * Allocate one extra page, use a page which is not 2MB aligned
+	 * and free the other.
+	 */
+	p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+	if (!p)
+		return NULL;
+
+	split_page(p, 1);
+
+	pfn = page_to_pfn(p);
+	if (IS_ALIGNED(__pfn_to_phys(pfn), PMD_SIZE)) {
+		pfn++;
+		__free_page(p);
+	} else {
+		__free_page(pfn_to_page(pfn + 1));
+	}
+
+	return pfn_to_page(pfn);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 25773bf72158..058eea8353c9 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1368,7 +1368,7 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 	svm = to_svm(vcpu);
 
 	err = -ENOMEM;
-	vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	vmcb01_page = snp_safe_alloc_page(vcpu);
 	if (!vmcb01_page)
 		goto out;
 
@@ -1377,7 +1377,7 @@  static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 		 * SEV-ES guests require a separate VMSA page used to contain
 		 * the encrypted register state of the guest.
 		 */
-		vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+		vmsa_page = snp_safe_alloc_page(vcpu);
 		if (!vmsa_page)
 			goto error_free_vmcb_page;
 
@@ -4539,6 +4539,16 @@  static int svm_vm_init(struct kvm *kvm)
 	return 0;
 }
 
+static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
+{
+	struct page *page = snp_safe_alloc_page(vcpu);
+
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.hardware_unsetup = svm_hardware_teardown,
 	.hardware_enable = svm_hardware_enable,
@@ -4667,6 +4677,8 @@  static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.complete_emulated_msr = svm_complete_emulated_msr,
 
 	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+
+	.alloc_apic_backing_page = svm_alloc_apic_backing_page,
 };
 
 static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d1f1512a4b47..e40800e9c998 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -575,6 +575,7 @@  void sev_es_create_vcpu(struct vcpu_svm *svm);
 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
 void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
 void sev_es_unmap_ghcb(struct vcpu_svm *svm);
+struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
 
 /* vmenter.S */