
[v6,08/14] KVM: X86: Introduce KVM_HC_PAGE_ENC_STATUS hypercall

Message ID: 265ef8a0ab75f01bc673cce6ddcf7988c7623943.1585548051.git.ashish.kalra@amd.com (mailing list archive)
State: New, archived
Series: Add AMD SEV guest live migration support

Commit Message

Kalra, Ashish March 30, 2020, 6:22 a.m. UTC
From: Brijesh Singh <Brijesh.Singh@amd.com>

This hypercall is used by the SEV guest to notify a change in the page
encryption status to the hypervisor. The hypercall should be invoked
only when the encryption attribute is changed from encrypted -> decrypted
and vice versa. By default all guest pages are considered encrypted.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 Documentation/virt/kvm/hypercalls.rst | 15 +++++
 arch/x86/include/asm/kvm_host.h       |  2 +
 arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.c                |  1 +
 arch/x86/kvm/x86.c                    |  6 ++
 include/uapi/linux/kvm_para.h         |  1 +
 6 files changed, 120 insertions(+)
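
For context, the guest side of this interface (added later in this
series) is expected to issue the hypercall from its page-state
transition paths, i.e. whenever set_memory_decrypted() or
set_memory_encrypted() flips the C-bit on a range. A minimal sketch of
such a call site (the helper name below is illustrative, not part of
this patch):

    #include <linux/kvm_para.h>

    /*
     * Tell the hypervisor that 'npages' pages starting at physical
     * address 'paddr' changed encryption state. kvm_hypercall3() is
     * the existing guest-side hypercall helper; KVM_HC_PAGE_ENC_STATUS
     * takes (gpa, npages, enc) in a0..a2, as documented in the
     * hypercalls.rst hunk below.
     */
    static void notify_page_enc_status_change(unsigned long paddr,
                                              unsigned long npages,
                                              bool enc)
    {
            kvm_hypercall3(KVM_HC_PAGE_ENC_STATUS, paddr, npages, enc);
    }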

Comments

Venu Busireddy April 3, 2020, midnight UTC | #1
On 2020-03-30 06:22:07 +0000, Ashish Kalra wrote:
> From: Brijesh Singh <Brijesh.Singh@amd.com>
> 
> This hypercall is used by the SEV guest to notify a change in the page
> encryption status to the hypervisor. The hypercall should be invoked
> only when the encryption attribute is changed from encrypted -> decrypted
> and vice versa. By default all guest pages are considered encrypted.
> 
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: x86@kernel.org
> Cc: kvm@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>

Reviewed-by: Venu Busireddy <venu.busireddy@oracle.com>

> ---
>  Documentation/virt/kvm/hypercalls.rst | 15 +++++
>  arch/x86/include/asm/kvm_host.h       |  2 +
>  arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/vmx.c                |  1 +
>  arch/x86/kvm/x86.c                    |  6 ++
>  include/uapi/linux/kvm_para.h         |  1 +
>  6 files changed, 120 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> index dbaf207e560d..ff5287e68e81 100644
> --- a/Documentation/virt/kvm/hypercalls.rst
> +++ b/Documentation/virt/kvm/hypercalls.rst
> @@ -169,3 +169,18 @@ a0: destination APIC ID
>  
>  :Usage example: When sending a call-function IPI-many to vCPUs, yield if
>  	        any of the IPI target vCPUs was preempted.
> +
> +
> +8. KVM_HC_PAGE_ENC_STATUS
> +-------------------------
> +:Architecture: x86
> +:Status: active
> +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> +
> +a0: the guest physical address of the start page
> +a1: the number of pages
> +a2: encryption attribute
> +
> +   Where:
> +	* 1: Encryption attribute is set
> +	* 0: Encryption attribute is cleared
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 98959e8cd448..90718fa3db47 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1267,6 +1267,8 @@ struct kvm_x86_ops {
>  
>  	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
>  	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
> +	int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> +				  unsigned long sz, unsigned long mode);
>  };
>  
>  struct kvm_arch_async_pf {
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 7c2721e18b06..1d8beaf1bceb 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -136,6 +136,8 @@ struct kvm_sev_info {
>  	int fd;			/* SEV device fd */
>  	unsigned long pages_locked; /* Number of pages locked */
>  	struct list_head regions_list;  /* List of registered regions */
> +	unsigned long *page_enc_bmap;
> +	unsigned long page_enc_bmap_size;
>  };
>  
>  struct kvm_svm {
> @@ -1991,6 +1993,9 @@ static void sev_vm_destroy(struct kvm *kvm)
>  
>  	sev_unbind_asid(kvm, sev->handle);
>  	sev_asid_free(sev->asid);
> +
> +	kvfree(sev->page_enc_bmap);
> +	sev->page_enc_bmap = NULL;
>  }
>  
>  static void avic_vm_destroy(struct kvm *kvm)
> @@ -7593,6 +7598,94 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
>  	return ret;
>  }
>  
> +static int sev_resize_page_enc_bitmap(struct kvm *kvm, unsigned long new_size)
> +{
> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +	unsigned long *map;
> +	unsigned long sz;
> +
> +	if (sev->page_enc_bmap_size >= new_size)
> +		return 0;
> +
> +	sz = ALIGN(new_size, BITS_PER_LONG) / 8;
> +
> +	map = vmalloc(sz);
> +	if (!map) {
> +		pr_err_once("Failed to allocate encrypted bitmap size %lx\n",
> +				sz);
> +		return -ENOMEM;
> +	}
> +
> +	/* mark the page encrypted (by default) */
> +	memset(map, 0xff, sz);
> +
> +	bitmap_copy(map, sev->page_enc_bmap, sev->page_enc_bmap_size);
> +	kvfree(sev->page_enc_bmap);
> +
> +	sev->page_enc_bmap = map;
> +	sev->page_enc_bmap_size = new_size;
> +
> +	return 0;
> +}
> +
> +static int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
> +				  unsigned long npages, unsigned long enc)
> +{
> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +	kvm_pfn_t pfn_start, pfn_end;
> +	gfn_t gfn_start, gfn_end;
> +	int ret;
> +
> +	if (!sev_guest(kvm))
> +		return -EINVAL;
> +
> +	if (!npages)
> +		return 0;
> +
> +	gfn_start = gpa_to_gfn(gpa);
> +	gfn_end = gfn_start + npages;
> +
> +	/* out-of-bounds access error check */
> +	if (gfn_end <= gfn_start)
> +		return -EINVAL;
> +
> +	/* let's make sure that the gpa exists in our memslot */
> +	pfn_start = gfn_to_pfn(kvm, gfn_start);
> +	pfn_end = gfn_to_pfn(kvm, gfn_end);
> +
> +	if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
> +		/*
> +		 * Allow guest MMIO range(s) to be added
> +		 * to the page encryption bitmap.
> +		 */
> +		return -EINVAL;
> +	}
> +
> +	if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
> +		/*
> +		 * Allow guest MMIO range(s) to be added
> +		 * to the page encryption bitmap.
> +		 */
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&kvm->lock);
> +	ret = sev_resize_page_enc_bitmap(kvm, gfn_end);
> +	if (ret)
> +		goto unlock;
> +
> +	if (enc)
> +		__bitmap_set(sev->page_enc_bmap, gfn_start,
> +				gfn_end - gfn_start);
> +	else
> +		__bitmap_clear(sev->page_enc_bmap, gfn_start,
> +				gfn_end - gfn_start);
> +
> +unlock:
> +	mutex_unlock(&kvm->lock);
> +	return ret;
> +}
> +
>  static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>  {
>  	struct kvm_sev_cmd sev_cmd;
> @@ -7995,6 +8088,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
>  	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
>  
>  	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
> +
> +	.page_enc_status_hc = svm_page_enc_status_hc,
>  };
>  
>  static int __init svm_init(void)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 079d9fbf278e..f68e76ee7f9c 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8001,6 +8001,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.nested_get_evmcs_version = NULL,
>  	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
>  	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
> +	.page_enc_status_hc = NULL,
>  };
>  
>  static void vmx_cleanup_l1d_flush(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cf95c36cb4f4..68428eef2dde 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7564,6 +7564,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
>  		kvm_sched_yield(vcpu->kvm, a0);
>  		ret = 0;
>  		break;
> +	case KVM_HC_PAGE_ENC_STATUS:
> +		ret = -KVM_ENOSYS;
> +		if (kvm_x86_ops->page_enc_status_hc)
> +			ret = kvm_x86_ops->page_enc_status_hc(vcpu->kvm,
> +					a0, a1, a2);
> +		break;
>  	default:
>  		ret = -KVM_ENOSYS;
>  		break;
> diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
> index 8b86609849b9..847b83b75dc8 100644
> --- a/include/uapi/linux/kvm_para.h
> +++ b/include/uapi/linux/kvm_para.h
> @@ -29,6 +29,7 @@
>  #define KVM_HC_CLOCK_PAIRING		9
>  #define KVM_HC_SEND_IPI		10
>  #define KVM_HC_SCHED_YIELD		11
> +#define KVM_HC_PAGE_ENC_STATUS		12
>  
>  /*
>   * hypercalls use architecture specific
> -- 
> 2.17.1
>
Krish Sadhukhan April 3, 2020, 1:31 a.m. UTC | #2
On 3/29/20 11:22 PM, Ashish Kalra wrote:
> From: Brijesh Singh <Brijesh.Singh@amd.com>
>
> This hypercall is used by the SEV guest to notify a change in the page
> encryption status to the hypervisor. The hypercall should be invoked
> only when the encryption attribute is changed from encrypted -> decrypted
> and vice versa. By default all guest pages are considered encrypted.
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: x86@kernel.org
> Cc: kvm@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> ---
>   Documentation/virt/kvm/hypercalls.rst | 15 +++++
>   arch/x86/include/asm/kvm_host.h       |  2 +
>   arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
>   arch/x86/kvm/vmx/vmx.c                |  1 +
>   arch/x86/kvm/x86.c                    |  6 ++
>   include/uapi/linux/kvm_para.h         |  1 +
>   6 files changed, 120 insertions(+)
>
> diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> index dbaf207e560d..ff5287e68e81 100644
> --- a/Documentation/virt/kvm/hypercalls.rst
> +++ b/Documentation/virt/kvm/hypercalls.rst
> @@ -169,3 +169,18 @@ a0: destination APIC ID
>   
>   :Usage example: When sending a call-function IPI-many to vCPUs, yield if
>   	        any of the IPI target vCPUs was preempted.
> +
> +
> +8. KVM_HC_PAGE_ENC_STATUS
> +-------------------------
> +:Architecture: x86
> +:Status: active
> +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> +
> +a0: the guest physical address of the start page
> +a1: the number of pages
> +a2: encryption attribute
> +
> +   Where:
> +	* 1: Encryption attribute is set
> +	* 0: Encryption attribute is cleared
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 98959e8cd448..90718fa3db47 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1267,6 +1267,8 @@ struct kvm_x86_ops {
>   
>   	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
>   	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
> +	int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> +				  unsigned long sz, unsigned long mode);
>   };
>   
>   struct kvm_arch_async_pf {
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 7c2721e18b06..1d8beaf1bceb 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -136,6 +136,8 @@ struct kvm_sev_info {
>   	int fd;			/* SEV device fd */
>   	unsigned long pages_locked; /* Number of pages locked */
>   	struct list_head regions_list;  /* List of registered regions */
> +	unsigned long *page_enc_bmap;
> +	unsigned long page_enc_bmap_size;
>   };
>   
>   struct kvm_svm {
> @@ -1991,6 +1993,9 @@ static void sev_vm_destroy(struct kvm *kvm)
>   
>   	sev_unbind_asid(kvm, sev->handle);
>   	sev_asid_free(sev->asid);
> +
> +	kvfree(sev->page_enc_bmap);
> +	sev->page_enc_bmap = NULL;
>   }
>   
>   static void avic_vm_destroy(struct kvm *kvm)
> @@ -7593,6 +7598,94 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
>   	return ret;
>   }
>   
> +static int sev_resize_page_enc_bitmap(struct kvm *kvm, unsigned long new_size)
> +{
> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +	unsigned long *map;
> +	unsigned long sz;
> +
> +	if (sev->page_enc_bmap_size >= new_size)
> +		return 0;
> +
> +	sz = ALIGN(new_size, BITS_PER_LONG) / 8;
> +
> +	map = vmalloc(sz);


Just wondering why we can't directly modify sev->page_enc_bmap.

> +	if (!map) {
> +		pr_err_once("Failed to allocate encrypted bitmap size %lx\n",
> +				sz);
> +		return -ENOMEM;
> +	}
> +
> +	/* mark the page encrypted (by default) */
> +	memset(map, 0xff, sz);
> +
> +	bitmap_copy(map, sev->page_enc_bmap, sev->page_enc_bmap_size);
> +	kvfree(sev->page_enc_bmap);
> +
> +	sev->page_enc_bmap = map;
> +	sev->page_enc_bmap_size = new_size;
> +
> +	return 0;
> +}
> +
> +static int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
> +				  unsigned long npages, unsigned long enc)
> +{
> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +	kvm_pfn_t pfn_start, pfn_end;
> +	gfn_t gfn_start, gfn_end;
> +	int ret;
> +
> +	if (!sev_guest(kvm))
> +		return -EINVAL;
> +
> +	if (!npages)
> +		return 0;
> +
> +	gfn_start = gpa_to_gfn(gpa);
> +	gfn_end = gfn_start + npages;
> +
> +	/* out-of-bounds access error check */
> +	if (gfn_end <= gfn_start)
> +		return -EINVAL;
> +
> +	/* let's make sure that the gpa exists in our memslot */
> +	pfn_start = gfn_to_pfn(kvm, gfn_start);
> +	pfn_end = gfn_to_pfn(kvm, gfn_end);
> +
> +	if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
> +		/*
> +		 * Allow guest MMIO range(s) to be added
> +		 * to the page encryption bitmap.
> +		 */
> +		return -EINVAL;
> +	}
> +
> +	if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
> +		/*
> +		 * Allow guest MMIO range(s) to be added
> +		 * to the page encryption bitmap.
> +		 */
> +		return -EINVAL;
> +	}


It seems is_error_noslot_pfn() covers both cases - i) gfn slot is 
absent, ii) failure to translate to pfn. So do we still need 
is_noslot_pfn() ?

> +
> +	mutex_lock(&kvm->lock);
> +	ret = sev_resize_page_enc_bitmap(kvm, gfn_end);
> +	if (ret)
> +		goto unlock;
> +
> +	if (enc)
> +		__bitmap_set(sev->page_enc_bmap, gfn_start,
> +				gfn_end - gfn_start);
> +	else
> +		__bitmap_clear(sev->page_enc_bmap, gfn_start,
> +				gfn_end - gfn_start);
> +
> +unlock:
> +	mutex_unlock(&kvm->lock);
> +	return ret;
> +}
> +
>   static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>   {
>   	struct kvm_sev_cmd sev_cmd;
> @@ -7995,6 +8088,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
>   	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
>   
>   	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
> +
> +	.page_enc_status_hc = svm_page_enc_status_hc,


Why not place it where other encryption ops are located ?

         ...

         .mem_enc_unreg_region

+      .page_enc_status_hc = svm_page_enc_status_hc

>   };
>   
>   static int __init svm_init(void)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 079d9fbf278e..f68e76ee7f9c 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8001,6 +8001,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>   	.nested_get_evmcs_version = NULL,
>   	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
>   	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
> +	.page_enc_status_hc = NULL,
>   };
>   
>   static void vmx_cleanup_l1d_flush(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cf95c36cb4f4..68428eef2dde 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7564,6 +7564,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
>   		kvm_sched_yield(vcpu->kvm, a0);
>   		ret = 0;
>   		break;
> +	case KVM_HC_PAGE_ENC_STATUS:
> +		ret = -KVM_ENOSYS;
> +		if (kvm_x86_ops->page_enc_status_hc)
> +			ret = kvm_x86_ops->page_enc_status_hc(vcpu->kvm,
> +					a0, a1, a2);
> +		break;
>   	default:
>   		ret = -KVM_ENOSYS;
>   		break;
> diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
> index 8b86609849b9..847b83b75dc8 100644
> --- a/include/uapi/linux/kvm_para.h
> +++ b/include/uapi/linux/kvm_para.h
> @@ -29,6 +29,7 @@
>   #define KVM_HC_CLOCK_PAIRING		9
>   #define KVM_HC_SEND_IPI		10
>   #define KVM_HC_SCHED_YIELD		11
> +#define KVM_HC_PAGE_ENC_STATUS		12
>   
>   /*
>    * hypercalls use architecture specific
Kalra, Ashish April 3, 2020, 1:57 a.m. UTC | #3
On Thu, Apr 02, 2020 at 06:31:54PM -0700, Krish Sadhukhan wrote:
> 
> On 3/29/20 11:22 PM, Ashish Kalra wrote:
> > From: Brijesh Singh <Brijesh.Singh@amd.com>
> > 
> > This hypercall is used by the SEV guest to notify a change in the page
> > encryption status to the hypervisor. The hypercall should be invoked
> > only when the encryption attribute is changed from encrypted -> decrypted
> > and vice versa. By default all guest pages are considered encrypted.
> > 
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> > Cc: Joerg Roedel <joro@8bytes.org>
> > Cc: Borislav Petkov <bp@suse.de>
> > Cc: Tom Lendacky <thomas.lendacky@amd.com>
> > Cc: x86@kernel.org
> > Cc: kvm@vger.kernel.org
> > Cc: linux-kernel@vger.kernel.org
> > Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> > Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> > ---
> >   Documentation/virt/kvm/hypercalls.rst | 15 +++++
> >   arch/x86/include/asm/kvm_host.h       |  2 +
> >   arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
> >   arch/x86/kvm/vmx/vmx.c                |  1 +
> >   arch/x86/kvm/x86.c                    |  6 ++
> >   include/uapi/linux/kvm_para.h         |  1 +
> >   6 files changed, 120 insertions(+)
> > 
> > diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> > index dbaf207e560d..ff5287e68e81 100644
> > --- a/Documentation/virt/kvm/hypercalls.rst
> > +++ b/Documentation/virt/kvm/hypercalls.rst
> > @@ -169,3 +169,18 @@ a0: destination APIC ID
> >   :Usage example: When sending a call-function IPI-many to vCPUs, yield if
> >   	        any of the IPI target vCPUs was preempted.
> > +
> > +
> > +8. KVM_HC_PAGE_ENC_STATUS
> > +-------------------------
> > +:Architecture: x86
> > +:Status: active
> > +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> > +
> > +a0: the guest physical address of the start page
> > +a1: the number of pages
> > +a2: encryption attribute
> > +
> > +   Where:
> > +	* 1: Encryption attribute is set
> > +	* 0: Encryption attribute is cleared
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 98959e8cd448..90718fa3db47 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1267,6 +1267,8 @@ struct kvm_x86_ops {
> >   	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
> >   	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
> > +	int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> > +				  unsigned long sz, unsigned long mode);
> >   };
> >   struct kvm_arch_async_pf {
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 7c2721e18b06..1d8beaf1bceb 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -136,6 +136,8 @@ struct kvm_sev_info {
> >   	int fd;			/* SEV device fd */
> >   	unsigned long pages_locked; /* Number of pages locked */
> >   	struct list_head regions_list;  /* List of registered regions */
> > +	unsigned long *page_enc_bmap;
> > +	unsigned long page_enc_bmap_size;
> >   };
> >   struct kvm_svm {
> > @@ -1991,6 +1993,9 @@ static void sev_vm_destroy(struct kvm *kvm)
> >   	sev_unbind_asid(kvm, sev->handle);
> >   	sev_asid_free(sev->asid);
> > +
> > +	kvfree(sev->page_enc_bmap);
> > +	sev->page_enc_bmap = NULL;
> >   }
> >   static void avic_vm_destroy(struct kvm *kvm)
> > @@ -7593,6 +7598,94 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >   	return ret;
> >   }
> > +static int sev_resize_page_enc_bitmap(struct kvm *kvm, unsigned long new_size)
> > +{
> > +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> > +	unsigned long *map;
> > +	unsigned long sz;
> > +
> > +	if (sev->page_enc_bmap_size >= new_size)
> > +		return 0;
> > +
> > +	sz = ALIGN(new_size, BITS_PER_LONG) / 8;
> > +
> > +	map = vmalloc(sz);
> 
> 
> Just wondering why we can't directly modify sev->page_enc_bmap.
> 

Because the page_enc_bmap needs to be resized here: it has to be
expanded into a new, larger allocation, and the old contents copied
over, since the existing buffer cannot simply be grown in place.

> > +	if (!map) {
> > +		pr_err_once("Failed to allocate encrypted bitmap size %lx\n",
> > +				sz);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	/* mark the page encrypted (by default) */
> > +	memset(map, 0xff, sz);
> > +
> > +	bitmap_copy(map, sev->page_enc_bmap, sev->page_enc_bmap_size);
> > +	kvfree(sev->page_enc_bmap);
> > +
> > +	sev->page_enc_bmap = map;
> > +	sev->page_enc_bmap_size = new_size;
> > +
> > +	return 0;
> > +}
> > +
> > +static int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
> > +				  unsigned long npages, unsigned long enc)
> > +{
> > +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> > +	kvm_pfn_t pfn_start, pfn_end;
> > +	gfn_t gfn_start, gfn_end;
> > +	int ret;
> > +
> > +	if (!sev_guest(kvm))
> > +		return -EINVAL;
> > +
> > +	if (!npages)
> > +		return 0;
> > +
> > +	gfn_start = gpa_to_gfn(gpa);
> > +	gfn_end = gfn_start + npages;
> > +
> > +	/* out-of-bounds access error check */
> > +	if (gfn_end <= gfn_start)
> > +		return -EINVAL;
> > +
> > +	/* let's make sure that the gpa exists in our memslot */
> > +	pfn_start = gfn_to_pfn(kvm, gfn_start);
> > +	pfn_end = gfn_to_pfn(kvm, gfn_end);
> > +
> > +	if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
> > +		/*
> > +		 * Allow guest MMIO range(s) to be added
> > +		 * to the page encryption bitmap.
> > +		 */
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
> > +		/*
> > +		 * Allow guest MMIO range(s) to be added
> > +		 * to the page encryption bitmap.
> > +		 */
> > +		return -EINVAL;
> > +	}
> 
> 
> It seems is_error_noslot_pfn() covers both cases - i) gfn slot is absent,
> ii) failure to translate to pfn. So do we still need is_noslot_pfn() ?
>

We do need the additional !is_noslot_pfn(..) check, as MMIO ranges
will not have a memslot allocated.
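
To restate the intended predicate (the macro semantics come from
include/linux/kvm_host.h; this helper is only an illustration, not
part of the patch):

    /*
     * is_error_noslot_pfn() is true both for hard translation errors
     * and for KVM_PFN_NOSLOT; is_noslot_pfn() is true only for the
     * latter. So the check below rejects real errors while letting
     * no-slot pfns (guest MMIO) through, so that those ranges can
     * still be recorded in the page encryption bitmap.
     */
    static bool pfn_is_hard_error(kvm_pfn_t pfn)
    {
            return is_error_noslot_pfn(pfn) && !is_noslot_pfn(pfn);
    }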

Thanks,
Ashish

> > +
> > +	mutex_lock(&kvm->lock);
> > +	ret = sev_resize_page_enc_bitmap(kvm, gfn_end);
> > +	if (ret)
> > +		goto unlock;
> > +
> > +	if (enc)
> > +		__bitmap_set(sev->page_enc_bmap, gfn_start,
> > +				gfn_end - gfn_start);
> > +	else
> > +		__bitmap_clear(sev->page_enc_bmap, gfn_start,
> > +				gfn_end - gfn_start);
> > +
> > +unlock:
> > +	mutex_unlock(&kvm->lock);
> > +	return ret;
> > +}
> > +
> >   static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
> >   {
> >   	struct kvm_sev_cmd sev_cmd;
> > @@ -7995,6 +8088,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
> >   	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
> >   	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
> > +
> > +	.page_enc_status_hc = svm_page_enc_status_hc,
> 
> 
> Why not place it where other encryption ops are located ?
> 
>         ...
> 
>         .mem_enc_unreg_region
> 
> +      .page_enc_status_hc = svm_page_enc_status_hc
> 
> >   };
> >   static int __init svm_init(void)
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 079d9fbf278e..f68e76ee7f9c 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -8001,6 +8001,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> >   	.nested_get_evmcs_version = NULL,
> >   	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
> >   	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
> > +	.page_enc_status_hc = NULL,
> >   };
> >   static void vmx_cleanup_l1d_flush(void)
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index cf95c36cb4f4..68428eef2dde 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -7564,6 +7564,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> >   		kvm_sched_yield(vcpu->kvm, a0);
> >   		ret = 0;
> >   		break;
> > +	case KVM_HC_PAGE_ENC_STATUS:
> > +		ret = -KVM_ENOSYS;
> > +		if (kvm_x86_ops->page_enc_status_hc)
> > +			ret = kvm_x86_ops->page_enc_status_hc(vcpu->kvm,
> > +					a0, a1, a2);
> > +		break;
> >   	default:
> >   		ret = -KVM_ENOSYS;
> >   		break;
> > diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
> > index 8b86609849b9..847b83b75dc8 100644
> > --- a/include/uapi/linux/kvm_para.h
> > +++ b/include/uapi/linux/kvm_para.h
> > @@ -29,6 +29,7 @@
> >   #define KVM_HC_CLOCK_PAIRING		9
> >   #define KVM_HC_SEND_IPI		10
> >   #define KVM_HC_SCHED_YIELD		11
> > +#define KVM_HC_PAGE_ENC_STATUS		12
> >   /*
> >    * hypercalls use architecture specific
Kalra, Ashish April 3, 2020, 2:58 a.m. UTC | #4
On Fri, Apr 03, 2020 at 01:57:48AM +0000, Ashish Kalra wrote:
> On Thu, Apr 02, 2020 at 06:31:54PM -0700, Krish Sadhukhan wrote:
> > 
> > On 3/29/20 11:22 PM, Ashish Kalra wrote:
> > > From: Brijesh Singh <Brijesh.Singh@amd.com>
> > > 
> > > This hypercall is used by the SEV guest to notify a change in the page
> > > encryption status to the hypervisor. The hypercall should be invoked
> > > only when the encryption attribute is changed from encrypted -> decrypted
> > > and vice versa. By default all guest pages are considered encrypted.
> > > 
> > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > Cc: Ingo Molnar <mingo@redhat.com>
> > > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > > Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> > > Cc: Joerg Roedel <joro@8bytes.org>
> > > Cc: Borislav Petkov <bp@suse.de>
> > > Cc: Tom Lendacky <thomas.lendacky@amd.com>
> > > Cc: x86@kernel.org
> > > Cc: kvm@vger.kernel.org
> > > Cc: linux-kernel@vger.kernel.org
> > > Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> > > Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> > > ---
> > >   Documentation/virt/kvm/hypercalls.rst | 15 +++++
> > >   arch/x86/include/asm/kvm_host.h       |  2 +
> > >   arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
> > >   arch/x86/kvm/vmx/vmx.c                |  1 +
> > >   arch/x86/kvm/x86.c                    |  6 ++
> > >   include/uapi/linux/kvm_para.h         |  1 +
> > >   6 files changed, 120 insertions(+)
> > > 
> > > diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> > > index dbaf207e560d..ff5287e68e81 100644
> > > --- a/Documentation/virt/kvm/hypercalls.rst
> > > +++ b/Documentation/virt/kvm/hypercalls.rst
> > > @@ -169,3 +169,18 @@ a0: destination APIC ID
> > >   :Usage example: When sending a call-function IPI-many to vCPUs, yield if
> > >   	        any of the IPI target vCPUs was preempted.
> > > +
> > > +
> > > +8. KVM_HC_PAGE_ENC_STATUS
> > > +-------------------------
> > > +:Architecture: x86
> > > +:Status: active
> > > +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> > > +
> > > +a0: the guest physical address of the start page
> > > +a1: the number of pages
> > > +a2: encryption attribute
> > > +
> > > +   Where:
> > > +	* 1: Encryption attribute is set
> > > +	* 0: Encryption attribute is cleared
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 98959e8cd448..90718fa3db47 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1267,6 +1267,8 @@ struct kvm_x86_ops {
> > >   	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
> > >   	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
> > > +	int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> > > +				  unsigned long sz, unsigned long mode);
> > >   };
> > >   struct kvm_arch_async_pf {
> > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > > index 7c2721e18b06..1d8beaf1bceb 100644
> > > --- a/arch/x86/kvm/svm.c
> > > +++ b/arch/x86/kvm/svm.c
> > > @@ -136,6 +136,8 @@ struct kvm_sev_info {
> > >   	int fd;			/* SEV device fd */
> > >   	unsigned long pages_locked; /* Number of pages locked */
> > >   	struct list_head regions_list;  /* List of registered regions */
> > > +	unsigned long *page_enc_bmap;
> > > +	unsigned long page_enc_bmap_size;
> > >   };
> > >   struct kvm_svm {
> > > @@ -1991,6 +1993,9 @@ static void sev_vm_destroy(struct kvm *kvm)
> > >   	sev_unbind_asid(kvm, sev->handle);
> > >   	sev_asid_free(sev->asid);
> > > +
> > > +	kvfree(sev->page_enc_bmap);
> > > +	sev->page_enc_bmap = NULL;
> > >   }
> > >   static void avic_vm_destroy(struct kvm *kvm)
> > > @@ -7593,6 +7598,94 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
> > >   	return ret;
> > >   }
> > > +static int sev_resize_page_enc_bitmap(struct kvm *kvm, unsigned long new_size)
> > > +{
> > > +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> > > +	unsigned long *map;
> > > +	unsigned long sz;
> > > +
> > > +	if (sev->page_enc_bmap_size >= new_size)
> > > +		return 0;
> > > +
> > > +	sz = ALIGN(new_size, BITS_PER_LONG) / 8;
> > > +
> > > +	map = vmalloc(sz);
> > 
> > 
> > Just wondering why we can't directly modify sev->page_enc_bmap.
> > 
> 
> Because the page_enc_bmap needs to be resized here: it has to be
> expanded into a new, larger allocation, and the old contents copied
> over, since the existing buffer cannot simply be grown in place.
> 

I don't believe there is anything like a realloc() kind of equivalent
for the kmalloc() interfaces.
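
(For the record, krealloc() does cover kmalloc()-backed buffers, but
at the time of this series there was no equivalent for vmalloc()-backed
ones, so the expansion has to be open-coded. A generic helper along
these lines would capture the pattern; vrealloc_default_ff() is a
made-up name, not an existing kernel API:)

    /*
     * Hypothetical helper distilling the copy-and-swap pattern used by
     * sev_resize_page_enc_bitmap(): allocate the larger buffer, fill
     * it with the default 0xff ("encrypted"), copy the old contents,
     * and free the old buffer. Sizes here are in bytes.
     */
    static void *vrealloc_default_ff(void *old, size_t old_sz, size_t new_sz)
    {
            void *buf = vmalloc(new_sz);

            if (!buf)
                    return NULL;
            memset(buf, 0xff, new_sz);
            if (old) {
                    memcpy(buf, old, old_sz);
                    kvfree(old);
            }
            return buf;
    }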

Thanks,
Ashish

> > > +	if (!map) {
> > > +		pr_err_once("Failed to allocate encrypted bitmap size %lx\n",
> > > +				sz);
> > > +		return -ENOMEM;
> > > +	}
> > > +
> > > +	/* mark the page encrypted (by default) */
> > > +	memset(map, 0xff, sz);
> > > +
> > > +	bitmap_copy(map, sev->page_enc_bmap, sev->page_enc_bmap_size);
> > > +	kvfree(sev->page_enc_bmap);
> > > +
> > > +	sev->page_enc_bmap = map;
> > > +	sev->page_enc_bmap_size = new_size;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
> > > +				  unsigned long npages, unsigned long enc)
> > > +{
> > > +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> > > +	kvm_pfn_t pfn_start, pfn_end;
> > > +	gfn_t gfn_start, gfn_end;
> > > +	int ret;
> > > +
> > > +	if (!sev_guest(kvm))
> > > +		return -EINVAL;
> > > +
> > > +	if (!npages)
> > > +		return 0;
> > > +
> > > +	gfn_start = gpa_to_gfn(gpa);
> > > +	gfn_end = gfn_start + npages;
> > > +
> > > +	/* out-of-bounds access error check */
> > > +	if (gfn_end <= gfn_start)
> > > +		return -EINVAL;
> > > +
> > > +	/* let's make sure that the gpa exists in our memslot */
> > > +	pfn_start = gfn_to_pfn(kvm, gfn_start);
> > > +	pfn_end = gfn_to_pfn(kvm, gfn_end);
> > > +
> > > +	if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
> > > +		/*
> > > +		 * Allow guest MMIO range(s) to be added
> > > +		 * to the page encryption bitmap.
> > > +		 */
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
> > > +		/*
> > > +		 * Allow guest MMIO range(s) to be added
> > > +		 * to the page encryption bitmap.
> > > +		 */
> > > +		return -EINVAL;
> > > +	}
> > 
> > 
> > It seems is_error_noslot_pfn() covers both cases - i) gfn slot is absent,
> > ii) failure to translate to pfn. So do we still need is_noslot_pfn() ?
> >
> 
> We do need the additional !is_noslot_pfn(..) check, as MMIO ranges
> will not have a memslot allocated.
> 
> Thanks,
> Ashish
> 
> > > +
> > > +	mutex_lock(&kvm->lock);
> > > +	ret = sev_resize_page_enc_bitmap(kvm, gfn_end);
> > > +	if (ret)
> > > +		goto unlock;
> > > +
> > > +	if (enc)
> > > +		__bitmap_set(sev->page_enc_bmap, gfn_start,
> > > +				gfn_end - gfn_start);
> > > +	else
> > > +		__bitmap_clear(sev->page_enc_bmap, gfn_start,
> > > +				gfn_end - gfn_start);
> > > +
> > > +unlock:
> > > +	mutex_unlock(&kvm->lock);
> > > +	return ret;
> > > +}
> > > +
> > >   static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
> > >   {
> > >   	struct kvm_sev_cmd sev_cmd;
> > > @@ -7995,6 +8088,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
> > >   	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
> > >   	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
> > > +
> > > +	.page_enc_status_hc = svm_page_enc_status_hc,
> > 
> > 
> > Why not place it where other encryption ops are located ?
> > 
> >         ...
> > 
> >         .mem_enc_unreg_region
> > 
> > +      .page_enc_status_hc = svm_page_enc_status_hc
> > 
> > >   };
> > >   static int __init svm_init(void)
> > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > > index 079d9fbf278e..f68e76ee7f9c 100644
> > > --- a/arch/x86/kvm/vmx/vmx.c
> > > +++ b/arch/x86/kvm/vmx/vmx.c
> > > @@ -8001,6 +8001,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> > >   	.nested_get_evmcs_version = NULL,
> > >   	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
> > >   	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
> > > +	.page_enc_status_hc = NULL,
> > >   };
> > >   static void vmx_cleanup_l1d_flush(void)
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index cf95c36cb4f4..68428eef2dde 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -7564,6 +7564,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> > >   		kvm_sched_yield(vcpu->kvm, a0);
> > >   		ret = 0;
> > >   		break;
> > > +	case KVM_HC_PAGE_ENC_STATUS:
> > > +		ret = -KVM_ENOSYS;
> > > +		if (kvm_x86_ops->page_enc_status_hc)
> > > +			ret = kvm_x86_ops->page_enc_status_hc(vcpu->kvm,
> > > +					a0, a1, a2);
> > > +		break;
> > >   	default:
> > >   		ret = -KVM_ENOSYS;
> > >   		break;
> > > diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
> > > index 8b86609849b9..847b83b75dc8 100644
> > > --- a/include/uapi/linux/kvm_para.h
> > > +++ b/include/uapi/linux/kvm_para.h
> > > @@ -29,6 +29,7 @@
> > >   #define KVM_HC_CLOCK_PAIRING		9
> > >   #define KVM_HC_SEND_IPI		10
> > >   #define KVM_HC_SCHED_YIELD		11
> > > +#define KVM_HC_PAGE_ENC_STATUS		12
> > >   /*
> > >    * hypercalls use architecture specific
Krish Sadhukhan April 6, 2020, 10:27 p.m. UTC | #5
On 4/2/20 7:58 PM, Ashish Kalra wrote:
> On Fri, Apr 03, 2020 at 01:57:48AM +0000, Ashish Kalra wrote:
>> On Thu, Apr 02, 2020 at 06:31:54PM -0700, Krish Sadhukhan wrote:
>>> On 3/29/20 11:22 PM, Ashish Kalra wrote:
>>>> From: Brijesh Singh <Brijesh.Singh@amd.com>
>>>>
>>>> This hypercall is used by the SEV guest to notify a change in the page
>>>> encryption status to the hypervisor. The hypercall should be invoked
>>>> only when the encryption attribute is changed from encrypted -> decrypted
>>>> and vice versa. By default all guest pages are considered encrypted.
>>>>
>>>> Cc: Thomas Gleixner <tglx@linutronix.de>
>>>> Cc: Ingo Molnar <mingo@redhat.com>
>>>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>>>> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
>>>> Cc: Joerg Roedel <joro@8bytes.org>
>>>> Cc: Borislav Petkov <bp@suse.de>
>>>> Cc: Tom Lendacky <thomas.lendacky@amd.com>
>>>> Cc: x86@kernel.org
>>>> Cc: kvm@vger.kernel.org
>>>> Cc: linux-kernel@vger.kernel.org
>>>> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
>>>> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
>>>> ---
>>>>    Documentation/virt/kvm/hypercalls.rst | 15 +++++
>>>>    arch/x86/include/asm/kvm_host.h       |  2 +
>>>>    arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
>>>>    arch/x86/kvm/vmx/vmx.c                |  1 +
>>>>    arch/x86/kvm/x86.c                    |  6 ++
>>>>    include/uapi/linux/kvm_para.h         |  1 +
>>>>    6 files changed, 120 insertions(+)
>>>>
>>>> diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
>>>> index dbaf207e560d..ff5287e68e81 100644
>>>> --- a/Documentation/virt/kvm/hypercalls.rst
>>>> +++ b/Documentation/virt/kvm/hypercalls.rst
>>>> @@ -169,3 +169,18 @@ a0: destination APIC ID
>>>>    :Usage example: When sending a call-function IPI-many to vCPUs, yield if
>>>>    	        any of the IPI target vCPUs was preempted.
>>>> +
>>>> +
>>>> +8. KVM_HC_PAGE_ENC_STATUS
>>>> +-------------------------
>>>> +:Architecture: x86
>>>> +:Status: active
>>>> +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
>>>> +
>>>> +a0: the guest physical address of the start page
>>>> +a1: the number of pages
>>>> +a2: encryption attribute
>>>> +
>>>> +   Where:
>>>> +	* 1: Encryption attribute is set
>>>> +	* 0: Encryption attribute is cleared
>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>>> index 98959e8cd448..90718fa3db47 100644
>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>> @@ -1267,6 +1267,8 @@ struct kvm_x86_ops {
>>>>    	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
>>>>    	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
>>>> +	int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
>>>> +				  unsigned long sz, unsigned long mode);
>>>>    };
>>>>    struct kvm_arch_async_pf {
>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>>>> index 7c2721e18b06..1d8beaf1bceb 100644
>>>> --- a/arch/x86/kvm/svm.c
>>>> +++ b/arch/x86/kvm/svm.c
>>>> @@ -136,6 +136,8 @@ struct kvm_sev_info {
>>>>    	int fd;			/* SEV device fd */
>>>>    	unsigned long pages_locked; /* Number of pages locked */
>>>>    	struct list_head regions_list;  /* List of registered regions */
>>>> +	unsigned long *page_enc_bmap;
>>>> +	unsigned long page_enc_bmap_size;
>>>>    };
>>>>    struct kvm_svm {
>>>> @@ -1991,6 +1993,9 @@ static void sev_vm_destroy(struct kvm *kvm)
>>>>    	sev_unbind_asid(kvm, sev->handle);
>>>>    	sev_asid_free(sev->asid);
>>>> +
>>>> +	kvfree(sev->page_enc_bmap);
>>>> +	sev->page_enc_bmap = NULL;
>>>>    }
>>>>    static void avic_vm_destroy(struct kvm *kvm)
>>>> @@ -7593,6 +7598,94 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
>>>>    	return ret;
>>>>    }
>>>> +static int sev_resize_page_enc_bitmap(struct kvm *kvm, unsigned long new_size)
>>>> +{
>>>> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>>> +	unsigned long *map;
>>>> +	unsigned long sz;
>>>> +
>>>> +	if (sev->page_enc_bmap_size >= new_size)
>>>> +		return 0;
>>>> +
>>>> +	sz = ALIGN(new_size, BITS_PER_LONG) / 8;
>>>> +
>>>> +	map = vmalloc(sz);
>>>
>>> Just wondering why we can't directly modify sev->page_enc_bmap.
>>>
>> Because the page_enc_bmap needs to be resized here: it has to be
>> expanded into a new, larger allocation, and the old contents copied
>> over, since the existing buffer cannot simply be grown in place.


OK.

> I don't believe there is anything like a realloc() kind of equivalent
> for the kmalloc() interfaces.
>
> Thanks,
> Ashish
>
>>>> +	if (!map) {
>>>> +		pr_err_once("Failed to allocate encrypted bitmap size %lx\n",
>>>> +				sz);
>>>> +		return -ENOMEM;
>>>> +	}
>>>> +
>>>> +	/* mark the page encrypted (by default) */
>>>> +	memset(map, 0xff, sz);
>>>> +
>>>> +	bitmap_copy(map, sev->page_enc_bmap, sev->page_enc_bmap_size);
>>>> +	kvfree(sev->page_enc_bmap);
>>>> +
>>>> +	sev->page_enc_bmap = map;
>>>> +	sev->page_enc_bmap_size = new_size;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
>>>> +				  unsigned long npages, unsigned long enc)
>>>> +{
>>>> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>>> +	kvm_pfn_t pfn_start, pfn_end;
>>>> +	gfn_t gfn_start, gfn_end;
>>>> +	int ret;
>>>> +
>>>> +	if (!sev_guest(kvm))
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (!npages)
>>>> +		return 0;
>>>> +
>>>> +	gfn_start = gpa_to_gfn(gpa);
>>>> +	gfn_end = gfn_start + npages;
>>>> +
>>>> +	/* out-of-bounds access error check */
>>>> +	if (gfn_end <= gfn_start)
>>>> +		return -EINVAL;
>>>> +
>>>> +	/* let's make sure that the gpa exists in our memslot */
>>>> +	pfn_start = gfn_to_pfn(kvm, gfn_start);
>>>> +	pfn_end = gfn_to_pfn(kvm, gfn_end);
>>>> +
>>>> +	if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
>>>> +		/*
>>>> +		 * Allow guest MMIO range(s) to be added
>>>> +		 * to the page encryption bitmap.
>>>> +		 */
>>>> +		return -EINVAL;
>>>> +	}
>>>> +
>>>> +	if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
>>>> +		/*
>>>> +		 * Allow guest MMIO range(s) to be added
>>>> +		 * to the page encryption bitmap.
>>>> +		 */
>>>> +		return -EINVAL;
>>>> +	}
>>>
>>> It seems is_error_noslot_pfn() covers both cases - i) gfn slot is absent,
>>> ii) failure to translate to pfn. So do we still need is_noslot_pfn() ?
>>>
>> We do need the additional !is_noslot_pfn(..) check, as MMIO ranges
>> will not have a memslot allocated.


The comments above is_error_noslot_pfn() seem to indicate that it covers 
both cases, "not in slot" and "failure to translate"...
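
For reference, the relevant predicates (approximately, from the
5.6-era include/linux/kvm_host.h) are:

    #define KVM_PFN_ERR_MASK        (0x1ULL << 63)
    #define KVM_PFN_ERR_NOSLOT_MASK (0x3ULL << 62)
    #define KVM_PFN_NOSLOT          (0x1ULL << 62)

    /* true for hard errors only */
    static inline bool is_error_pfn(kvm_pfn_t pfn)
    {
            return !!(pfn & KVM_PFN_ERR_MASK);
    }

    /* true for hard errors *and* for KVM_PFN_NOSLOT */
    static inline bool is_error_noslot_pfn(kvm_pfn_t pfn)
    {
            return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK);
    }

    /* true for KVM_PFN_NOSLOT only */
    static inline bool is_noslot_pfn(kvm_pfn_t pfn)
    {
            return pfn == KVM_PFN_NOSLOT;
    }

So is_error_noslot_pfn() does cover both cases; the extra
!is_noslot_pfn() test is what carves the no-slot (MMIO) case back out
of the failure path.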


Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>

>>
>> Thanks,
>> Ashish
>>
>>>> +
>>>> +	mutex_lock(&kvm->lock);
>>>> +	ret = sev_resize_page_enc_bitmap(kvm, gfn_end);
>>>> +	if (ret)
>>>> +		goto unlock;
>>>> +
>>>> +	if (enc)
>>>> +		__bitmap_set(sev->page_enc_bmap, gfn_start,
>>>> +				gfn_end - gfn_start);
>>>> +	else
>>>> +		__bitmap_clear(sev->page_enc_bmap, gfn_start,
>>>> +				gfn_end - gfn_start);
>>>> +
>>>> +unlock:
>>>> +	mutex_unlock(&kvm->lock);
>>>> +	return ret;
>>>> +}
>>>> +
>>>>    static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>>>>    {
>>>>    	struct kvm_sev_cmd sev_cmd;
>>>> @@ -7995,6 +8088,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
>>>>    	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
>>>>    	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
>>>> +
>>>> +	.page_enc_status_hc = svm_page_enc_status_hc,
>>>
>>> Why not place it where other encryption ops are located ?
>>>
>>>          ...
>>>
>>>          .mem_enc_unreg_region
>>>
>>> +      .page_enc_status_hc = svm_page_enc_status_hc
>>>
>>>>    };
>>>>    static int __init svm_init(void)
>>>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>>>> index 079d9fbf278e..f68e76ee7f9c 100644
>>>> --- a/arch/x86/kvm/vmx/vmx.c
>>>> +++ b/arch/x86/kvm/vmx/vmx.c
>>>> @@ -8001,6 +8001,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>>>>    	.nested_get_evmcs_version = NULL,
>>>>    	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
>>>>    	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
>>>> +	.page_enc_status_hc = NULL,
>>>>    };
>>>>    static void vmx_cleanup_l1d_flush(void)
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>> index cf95c36cb4f4..68428eef2dde 100644
>>>> --- a/arch/x86/kvm/x86.c
>>>> +++ b/arch/x86/kvm/x86.c
>>>> @@ -7564,6 +7564,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
>>>>    		kvm_sched_yield(vcpu->kvm, a0);
>>>>    		ret = 0;
>>>>    		break;
>>>> +	case KVM_HC_PAGE_ENC_STATUS:
>>>> +		ret = -KVM_ENOSYS;
>>>> +		if (kvm_x86_ops->page_enc_status_hc)
>>>> +			ret = kvm_x86_ops->page_enc_status_hc(vcpu->kvm,
>>>> +					a0, a1, a2);
>>>> +		break;
>>>>    	default:
>>>>    		ret = -KVM_ENOSYS;
>>>>    		break;
>>>> diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
>>>> index 8b86609849b9..847b83b75dc8 100644
>>>> --- a/include/uapi/linux/kvm_para.h
>>>> +++ b/include/uapi/linux/kvm_para.h
>>>> @@ -29,6 +29,7 @@
>>>>    #define KVM_HC_CLOCK_PAIRING		9
>>>>    #define KVM_HC_SEND_IPI		10
>>>>    #define KVM_HC_SCHED_YIELD		11
>>>> +#define KVM_HC_PAGE_ENC_STATUS		12
>>>>    /*
>>>>     * hypercalls use architecture specific
Steve Rutherford April 7, 2020, 2:17 a.m. UTC | #6
On Sun, Mar 29, 2020 at 11:22 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
>
> From: Brijesh Singh <Brijesh.Singh@amd.com>
>
> This hypercall is used by the SEV guest to notify a change in the page
> encryption status to the hypervisor. The hypercall should be invoked
> only when the encryption attribute is changed from encrypted -> decrypted
> and vice versa. By default all guest pages are considered encrypted.
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: x86@kernel.org
> Cc: kvm@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> ---
>  Documentation/virt/kvm/hypercalls.rst | 15 +++++
>  arch/x86/include/asm/kvm_host.h       |  2 +
>  arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
>  arch/x86/kvm/vmx/vmx.c                |  1 +
>  arch/x86/kvm/x86.c                    |  6 ++
>  include/uapi/linux/kvm_para.h         |  1 +
>  6 files changed, 120 insertions(+)
>
> diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> index dbaf207e560d..ff5287e68e81 100644
> --- a/Documentation/virt/kvm/hypercalls.rst
> +++ b/Documentation/virt/kvm/hypercalls.rst
> @@ -169,3 +169,18 @@ a0: destination APIC ID
>
>  :Usage example: When sending a call-function IPI-many to vCPUs, yield if
>                 any of the IPI target vCPUs was preempted.
> +
> +
> +8. KVM_HC_PAGE_ENC_STATUS
> +-------------------------
> +:Architecture: x86
> +:Status: active
> +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> +
> +a0: the guest physical address of the start page
> +a1: the number of pages
> +a2: encryption attribute
> +
> +   Where:
> +       * 1: Encryption attribute is set
> +       * 0: Encryption attribute is cleared
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 98959e8cd448..90718fa3db47 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1267,6 +1267,8 @@ struct kvm_x86_ops {
>
>         bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
>         int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
> +       int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> +                                 unsigned long sz, unsigned long mode);
Nit: spell out size instead of sz.
>  };
>
>  struct kvm_arch_async_pf {
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 7c2721e18b06..1d8beaf1bceb 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -136,6 +136,8 @@ struct kvm_sev_info {
>         int fd;                 /* SEV device fd */
>         unsigned long pages_locked; /* Number of pages locked */
>         struct list_head regions_list;  /* List of registered regions */
> +       unsigned long *page_enc_bmap;
> +       unsigned long page_enc_bmap_size;
>  };
>
>  struct kvm_svm {
> @@ -1991,6 +1993,9 @@ static void sev_vm_destroy(struct kvm *kvm)
>
>         sev_unbind_asid(kvm, sev->handle);
>         sev_asid_free(sev->asid);
> +
> +       kvfree(sev->page_enc_bmap);
> +       sev->page_enc_bmap = NULL;
>  }
>
>  static void avic_vm_destroy(struct kvm *kvm)
> @@ -7593,6 +7598,94 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
>         return ret;
>  }
>
> +static int sev_resize_page_enc_bitmap(struct kvm *kvm, unsigned long new_size)
> +{
> +       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +       unsigned long *map;
> +       unsigned long sz;
> +
> +       if (sev->page_enc_bmap_size >= new_size)
> +               return 0;
> +
> +       sz = ALIGN(new_size, BITS_PER_LONG) / 8;
> +
> +       map = vmalloc(sz);
> +       if (!map) {
> +               pr_err_once("Failed to allocate encrypted bitmap size %lx\n",
> +                               sz);
> +               return -ENOMEM;
> +       }
> +
> +       /* mark the page encrypted (by default) */
> +       memset(map, 0xff, sz);
> +
> +       bitmap_copy(map, sev->page_enc_bmap, sev->page_enc_bmap_size);
> +       kvfree(sev->page_enc_bmap);
> +
> +       sev->page_enc_bmap = map;
> +       sev->page_enc_bmap_size = new_size;
> +
> +       return 0;
> +}
> +
> +static int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
> +                                 unsigned long npages, unsigned long enc)
> +{
> +       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +       kvm_pfn_t pfn_start, pfn_end;
> +       gfn_t gfn_start, gfn_end;
> +       int ret;
> +
> +       if (!sev_guest(kvm))
> +               return -EINVAL;
> +
> +       if (!npages)
> +               return 0;
> +
> +       gfn_start = gpa_to_gfn(gpa);
> +       gfn_end = gfn_start + npages;
> +
> +       /* out-of-bounds access error check */
> +       if (gfn_end <= gfn_start)
> +               return -EINVAL;
> +
> +       /* let's make sure that the gpa exists in our memslot */
> +       pfn_start = gfn_to_pfn(kvm, gfn_start);
> +       pfn_end = gfn_to_pfn(kvm, gfn_end);
> +
> +       if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
> +               /*
> +                * Allow guest MMIO range(s) to be added
> +                * to the page encryption bitmap.
> +                */
> +               return -EINVAL;
> +       }
> +
> +       if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
> +               /*
> +                * Allow guest MMIO range(s) to be added
> +                * to the page encryption bitmap.
> +                */
> +               return -EINVAL;
> +       }
> +
> +       mutex_lock(&kvm->lock);
> +       ret = sev_resize_page_enc_bitmap(kvm, gfn_end);
> +       if (ret)
> +               goto unlock;
> +
> +       if (enc)
> +               __bitmap_set(sev->page_enc_bmap, gfn_start,
> +                               gfn_end - gfn_start);
> +       else
> +               __bitmap_clear(sev->page_enc_bmap, gfn_start,
> +                               gfn_end - gfn_start);
> +
> +unlock:
> +       mutex_unlock(&kvm->lock);
> +       return ret;
> +}
> +
>  static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>  {
>         struct kvm_sev_cmd sev_cmd;
> @@ -7995,6 +8088,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
>         .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
>
>         .apic_init_signal_blocked = svm_apic_init_signal_blocked,
> +
> +       .page_enc_status_hc = svm_page_enc_status_hc,
>  };
>
>  static int __init svm_init(void)
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 079d9fbf278e..f68e76ee7f9c 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8001,6 +8001,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>         .nested_get_evmcs_version = NULL,
>         .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
>         .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
> +       .page_enc_status_hc = NULL,
>  };
>
>  static void vmx_cleanup_l1d_flush(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cf95c36cb4f4..68428eef2dde 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7564,6 +7564,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
>                 kvm_sched_yield(vcpu->kvm, a0);
>                 ret = 0;
>                 break;
> +       case KVM_HC_PAGE_ENC_STATUS:
> +               ret = -KVM_ENOSYS;
> +               if (kvm_x86_ops->page_enc_status_hc)
> +                       ret = kvm_x86_ops->page_enc_status_hc(vcpu->kvm,
> +                                       a0, a1, a2);
> +               break;
>         default:
>                 ret = -KVM_ENOSYS;
>                 break;
> diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
> index 8b86609849b9..847b83b75dc8 100644
> --- a/include/uapi/linux/kvm_para.h
> +++ b/include/uapi/linux/kvm_para.h
> @@ -29,6 +29,7 @@
>  #define KVM_HC_CLOCK_PAIRING           9
>  #define KVM_HC_SEND_IPI                10
>  #define KVM_HC_SCHED_YIELD             11
> +#define KVM_HC_PAGE_ENC_STATUS         12
>
>  /*
>   * hypercalls use architecture specific
> --
> 2.17.1
>

I'm still not excited by the dynamic resizing. I believe the guest
hypercall can be called in atomic contexts, which makes me
particularly unexcited to see a potentially large vmalloc on the host
followed by filling the buffer. Particularly when the buffer might be
non-trivial in size (~1MB per 32GB, per some back of the envelope
math).
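
(Spelled out: 32 GiB / 4 KiB per page = 8M pages; at one bit per page
that is 8M bits = 1 MiB of bitmap, matching the estimate above.)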

I'd like to see an enable cap for preallocating this. Yes, the first
call might not be the right value because of hotplug, but won't the
VMM know when hotplug is happening? If the VMM asks for the wrong
size, and does not update the size correctly before granting the VM
access to more RAM, that seems like the VMM's fault.
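
Something along these lines, for concreteness (the function below is
hypothetical, just to sketch the enable-cap shape being suggested; it
is not part of this series):

    /*
     * Hypothetical preallocation path: the VMM calls KVM_ENABLE_CAP
     * with the highest gfn it will ever give the guest, so that the
     * hypercall path never has to allocate. Reuses the patch's
     * sev_resize_page_enc_bitmap() under kvm->lock.
     */
    static int svm_prealloc_page_enc_bitmap(struct kvm *kvm, u64 max_gfn)
    {
            int ret;

            mutex_lock(&kvm->lock);
            ret = sev_resize_page_enc_bitmap(kvm, max_gfn + 1);
            mutex_unlock(&kvm->lock);
            return ret;
    }

With that in place, svm_page_enc_status_hc() could simply fail (say,
with -KVM_E2BIG) when gfn_end exceeds page_enc_bmap_size, instead of
resizing in what may be an atomic guest context.
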
Kalra, Ashish April 7, 2020, 5:27 a.m. UTC | #7
Hello Steve,

On Mon, Apr 06, 2020 at 07:17:37PM -0700, Steve Rutherford wrote:
> On Sun, Mar 29, 2020 at 11:22 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
> >
> > From: Brijesh Singh <Brijesh.Singh@amd.com>
> >
> > This hypercall is used by the SEV guest to notify a change in the page
> > encryption status to the hypervisor. The hypercall should be invoked
> > only when the encryption attribute is changed from encrypted -> decrypted
> > and vice versa. By default all guest pages are considered encrypted.
> >
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> > Cc: Joerg Roedel <joro@8bytes.org>
> > Cc: Borislav Petkov <bp@suse.de>
> > Cc: Tom Lendacky <thomas.lendacky@amd.com>
> > Cc: x86@kernel.org
> > Cc: kvm@vger.kernel.org
> > Cc: linux-kernel@vger.kernel.org
> > Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> > Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> > ---
> >  Documentation/virt/kvm/hypercalls.rst | 15 +++++
> >  arch/x86/include/asm/kvm_host.h       |  2 +
> >  arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
> >  arch/x86/kvm/vmx/vmx.c                |  1 +
> >  arch/x86/kvm/x86.c                    |  6 ++
> >  include/uapi/linux/kvm_para.h         |  1 +
> >  6 files changed, 120 insertions(+)
> >
> > diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> > index dbaf207e560d..ff5287e68e81 100644
> > --- a/Documentation/virt/kvm/hypercalls.rst
> > +++ b/Documentation/virt/kvm/hypercalls.rst
> > @@ -169,3 +169,18 @@ a0: destination APIC ID
> >
> >  :Usage example: When sending a call-function IPI-many to vCPUs, yield if
> >                 any of the IPI target vCPUs was preempted.
> > +
> > +
> > +8. KVM_HC_PAGE_ENC_STATUS
> > +-------------------------
> > +:Architecture: x86
> > +:Status: active
> > +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> > +
> > +a0: the guest physical address of the start page
> > +a1: the number of pages
> > +a2: encryption attribute
> > +
> > +   Where:
> > +       * 1: Encryption attribute is set
> > +       * 0: Encryption attribute is cleared
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 98959e8cd448..90718fa3db47 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1267,6 +1267,8 @@ struct kvm_x86_ops {
> >
> >         bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
> >         int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
> > +       int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> > +                                 unsigned long sz, unsigned long mode);
> Nit: spell out size instead of sz.
> >  };
> >
> >  struct kvm_arch_async_pf {
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 7c2721e18b06..1d8beaf1bceb 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -136,6 +136,8 @@ struct kvm_sev_info {
> >         int fd;                 /* SEV device fd */
> >         unsigned long pages_locked; /* Number of pages locked */
> >         struct list_head regions_list;  /* List of registered regions */
> > +       unsigned long *page_enc_bmap;
> > +       unsigned long page_enc_bmap_size;
> >  };
> >
> >  struct kvm_svm {
> > @@ -1991,6 +1993,9 @@ static void sev_vm_destroy(struct kvm *kvm)
> >
> >         sev_unbind_asid(kvm, sev->handle);
> >         sev_asid_free(sev->asid);
> > +
> > +       kvfree(sev->page_enc_bmap);
> > +       sev->page_enc_bmap = NULL;
> >  }
> >
> >  static void avic_vm_destroy(struct kvm *kvm)
> > @@ -7593,6 +7598,94 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >         return ret;
> >  }
> >
> > +static int sev_resize_page_enc_bitmap(struct kvm *kvm, unsigned long new_size)
> > +{
> > +       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> > +       unsigned long *map;
> > +       unsigned long sz;
> > +
> > +       if (sev->page_enc_bmap_size >= new_size)
> > +               return 0;
> > +
> > +       sz = ALIGN(new_size, BITS_PER_LONG) / 8;
> > +
> > +       map = vmalloc(sz);
> > +       if (!map) {
> > +               pr_err_once("Failed to allocate encrypted bitmap size %lx\n",
> > +                               sz);
> > +               return -ENOMEM;
> > +       }
> > +
> > +       /* mark the page encrypted (by default) */
> > +       memset(map, 0xff, sz);
> > +
> > +       bitmap_copy(map, sev->page_enc_bmap, sev->page_enc_bmap_size);
> > +       kvfree(sev->page_enc_bmap);
> > +
> > +       sev->page_enc_bmap = map;
> > +       sev->page_enc_bmap_size = new_size;
> > +
> > +       return 0;
> > +}
> > +
> > +static int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
> > +                                 unsigned long npages, unsigned long enc)
> > +{
> > +       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> > +       kvm_pfn_t pfn_start, pfn_end;
> > +       gfn_t gfn_start, gfn_end;
> > +       int ret;
> > +
> > +       if (!sev_guest(kvm))
> > +               return -EINVAL;
> > +
> > +       if (!npages)
> > +               return 0;
> > +
> > +       gfn_start = gpa_to_gfn(gpa);
> > +       gfn_end = gfn_start + npages;
> > +
> > +       /* out of bound access error check */
> > +       if (gfn_end <= gfn_start)
> > +               return -EINVAL;
> > +
> > +       /* lets make sure that gpa exist in our memslot */
> > +       pfn_start = gfn_to_pfn(kvm, gfn_start);
> > +       pfn_end = gfn_to_pfn(kvm, gfn_end);
> > +
> > +       if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
> > +               /*
> > +                * Allow guest MMIO range(s) to be added
> > +                * to the page encryption bitmap.
> > +                */
> > +               return -EINVAL;
> > +       }
> > +
> > +       if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
> > +               /*
> > +                * Allow guest MMIO range(s) to be added
> > +                * to the page encryption bitmap.
> > +                */
> > +               return -EINVAL;
> > +       }
> > +
> > +       mutex_lock(&kvm->lock);
> > +       ret = sev_resize_page_enc_bitmap(kvm, gfn_end);
> > +       if (ret)
> > +               goto unlock;
> > +
> > +       if (enc)
> > +               __bitmap_set(sev->page_enc_bmap, gfn_start,
> > +                               gfn_end - gfn_start);
> > +       else
> > +               __bitmap_clear(sev->page_enc_bmap, gfn_start,
> > +                               gfn_end - gfn_start);
> > +
> > +unlock:
> > +       mutex_unlock(&kvm->lock);
> > +       return ret;
> > +}
> > +
> >  static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
> >  {
> >         struct kvm_sev_cmd sev_cmd;
> > @@ -7995,6 +8088,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
> >         .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
> >
> >         .apic_init_signal_blocked = svm_apic_init_signal_blocked,
> > +
> > +       .page_enc_status_hc = svm_page_enc_status_hc,
> >  };
> >
> >  static int __init svm_init(void)
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 079d9fbf278e..f68e76ee7f9c 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -8001,6 +8001,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> >         .nested_get_evmcs_version = NULL,
> >         .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
> >         .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
> > +       .page_enc_status_hc = NULL,
> >  };
> >
> >  static void vmx_cleanup_l1d_flush(void)
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index cf95c36cb4f4..68428eef2dde 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -7564,6 +7564,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> >                 kvm_sched_yield(vcpu->kvm, a0);
> >                 ret = 0;
> >                 break;
> > +       case KVM_HC_PAGE_ENC_STATUS:
> > +               ret = -KVM_ENOSYS;
> > +               if (kvm_x86_ops->page_enc_status_hc)
> > +                       ret = kvm_x86_ops->page_enc_status_hc(vcpu->kvm,
> > +                                       a0, a1, a2);
> > +               break;
> >         default:
> >                 ret = -KVM_ENOSYS;
> >                 break;
> > diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
> > index 8b86609849b9..847b83b75dc8 100644
> > --- a/include/uapi/linux/kvm_para.h
> > +++ b/include/uapi/linux/kvm_para.h
> > @@ -29,6 +29,7 @@
> >  #define KVM_HC_CLOCK_PAIRING           9
> >  #define KVM_HC_SEND_IPI                10
> >  #define KVM_HC_SCHED_YIELD             11
> > +#define KVM_HC_PAGE_ENC_STATUS         12
> >
> >  /*
> >   * hypercalls use architecture specific
> > --
> > 2.17.1
> >
> 
> I'm still not excited by the dynamic resizing. I believe the guest
> hypercall can be called in atomic contexts, which makes me
> particularly unexcited to see a potentially large vmalloc on the host
> followed by filling the buffer. Particularly when the buffer might be
> non-trivial in size (~1MB per 32GB, per some back of the envelope
> math).
>

I think, looking at practical situations, most hypercalls will
happen during the boot stage, when device-specific initializations are
happening, so typically the maximum page encryption bitmap size would
be allocated early enough.

In fact, initial hypercalls made by OVMF will probably allocate the
maximum page bitmap size even before the kernel comes up, especially
as they will be setting up page enc/dec status for MMIO, ROM, ACPI
regions, PCI device memory, etc., and most importantly for the
"non-existent" high memory range (which is probably what will drive the
maximum page encryption bitmap size allocated/resized).
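
Just for reference, the Linux guest side of this is a thin wrapper
around the hypercall, something like the sketch below (the actual
helper name in the guest patches may differ):

    /* guest: report npages starting at gpa as encrypted/decrypted */
    static inline void notify_page_enc_status(unsigned long gpa,
                                              unsigned long npages,
                                              bool enc)
    {
            /* a0 = gpa, a1 = npages, a2 = 1 (C-bit set) / 0 (cleared) */
            kvm_hypercall3(KVM_HC_PAGE_ENC_STATUS, gpa, npages,
                           enc ? 1 : 0);
    }

OVMF issues the equivalent call for each such region during early boot.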

Let me know if you have different thoughts on this?

> I'd like to see an enable cap for preallocating this. Yes, the first
> call might not be the right value because of hotplug, but won't the
> VMM know when hotplug is happening? If the VMM asks for the wrong
> size, and does not update the size correctly before granting the VM
> access to more RAM, that seems like the VMM's fault.

Thanks,
Ashish
Steve Rutherford April 8, 2020, 12:01 a.m. UTC | #8
On Mon, Apr 6, 2020 at 10:27 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
>
> Hello Steve,
>
> On Mon, Apr 06, 2020 at 07:17:37PM -0700, Steve Rutherford wrote:
> > On Sun, Mar 29, 2020 at 11:22 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
> > >
> > > [...]
> >
> > I'm still not excited by the dynamic resizing. I believe the guest
> > hypercall can be called in atomic contexts, which makes me
> > particularly unexcited to see a potentially large vmalloc on the host
> > followed by filling the buffer. Particularly when the buffer might be
> > non-trivial in size (~1MB per 32GB, per some back of the envelope
> > math).
> >
>
> I think looking at more practical situations, most hypercalls will
> happen during the boot stage, when device specific initializations are
> happening, so typically the maximum page encryption bitmap size would
> be allocated early enough.
>
> In fact, initial hypercalls made by OVMF will probably allocate the
> maximum page bitmap size even before the kernel comes up, especially
> as they will be setting up page enc/dec status for MMIO, ROM, ACPI
> regions, PCI device memory, etc., and most importantly for
> "non-existent" high memory range (which will probably be the
> maximum size page encryption bitmap allocated/resized).
>
> Let me know if you have different thoughts on this ?

Hi Ashish,

If this is not an issue in practice, we can just move past this. If we
are basically guaranteed that OVMF will trigger hypercalls that expand
the bitmap beyond the top of memory, then, yes, that should work. That
leaves me slightly nervous that OVMF might regress since it's not
obvious that calling a hypercall beyond the top of memory would be
"required" for avoiding a somewhat indirectly related issue in guest
kernels.

Adding a kvm_enable_cap doesn't seem particularly complicated and
sidesteps all of these concerns, so I still prefer it. Caveat: I haven't
reviewed the patches about the feature bits yet. The enable cap would
also make it possible for kernels that support live migration to avoid
advertising live migration if host usermode does not want it to be
advertised. This seems pretty important, since hosts that don't plan
to live migrate should have the ability to tell the guest to stop
calling.
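
For instance, on the guest side the hypercall would presumably be gated
on something like the below (feature bit name hypothetical, from the
feature-bit patches I haven't reviewed yet):

    if (kvm_para_has_feature(KVM_FEATURE_SEV_LIVE_MIGRATION))
            kvm_hypercall3(KVM_HC_PAGE_ENC_STATUS, gpa, npages, enc);

so clearing that bit from usermode would stop the guest from calling.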

Thanks,
Steve
Brijesh Singh April 8, 2020, 12:29 a.m. UTC | #9
On 4/7/20 7:01 PM, Steve Rutherford wrote:
> On Mon, Apr 6, 2020 at 10:27 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
>> Hello Steve,
>>
>> On Mon, Apr 06, 2020 at 07:17:37PM -0700, Steve Rutherford wrote:
>>> On Sun, Mar 29, 2020 at 11:22 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
>>>> [...]
>>> I'm still not excited by the dynamic resizing. I believe the guest
>>> hypercall can be called in atomic contexts, which makes me
>>> particularly unexcited to see a potentially large vmalloc on the host
>>> followed by filling the buffer. Particularly when the buffer might be
>>> non-trivial in size (~1MB per 32GB, per some back of the envelope
>>> math).
>>>
>> I think looking at more practical situations, most hypercalls will
>> happen during the boot stage, when device specific initializations are
>> happening, so typically the maximum page encryption bitmap size would
>> be allocated early enough.
>>
>> In fact, initial hypercalls made by OVMF will probably allocate the
>> maximum page bitmap size even before the kernel comes up, especially
>> as they will be setting up page enc/dec status for MMIO, ROM, ACPI
>> regions, PCI device memory, etc., and most importantly for
>> "non-existent" high memory range (which will probably be the
>> maximum size page encryption bitmap allocated/resized).
>>
>> Let me know if you have different thoughts on this ?
> Hi Ashish,
>
> If this is not an issue in practice, we can just move past this. If we
> are basically guaranteed that OVMF will trigger hypercalls that expand
> the bitmap beyond the top of memory, then, yes, that should work. That
> leaves me slightly nervous that OVMF might regress since it's not
> obvious that calling a hypercall beyond the top of memory would be
> "required" for avoiding a somewhat indirectly related issue in guest
> kernels.


If possible then we should try to avoid growing/shrinking the bitmap.
Today OVMF may not be accessing beyond the top of memory, but a malicious
guest could send a hypercall down which can trigger a huge memory
allocation on the host side and may eventually cause denial of service
for others. I am in favor of finding some solution to handle this case.
How about Steve's suggestion of the VMM making a call down to the kernel
to tell it how big the bitmap should be? Initially it should be equal to
the guest RAM, and if the VMM ever expands the memory it can send down
another notification to increase the bitmap.

Optionally, instead of adding a new ioctl, I was wondering if we can
extend kvm_arch_prepare_memory_region() with an SVM-specific x86_ops
callback which can read the userspace-provided memory region, calculate
the amount of guest RAM managed by KVM, and grow/shrink the bitmap based
on that information. I have not looked deep enough to see if it's doable,
but if it can work then we can avoid adding yet another ioctl.
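
Roughly something like this (untested; the hook name is invented just
to sketch the idea):

    /* svm.c: grow the bitmap whenever userspace adds/moves a memslot */
    static int svm_prepare_memory_region(struct kvm *kvm,
                    const struct kvm_userspace_memory_region *mem)
    {
            unsigned long last_gfn;
            int ret;

            if (!sev_guest(kvm))
                    return 0;

            last_gfn = (mem->guest_phys_addr + mem->memory_size)
                            >> PAGE_SHIFT;

            mutex_lock(&kvm->lock);
            ret = sev_resize_page_enc_bitmap(kvm, last_gfn);
            mutex_unlock(&kvm->lock);
            return ret;
    }

kvm_arch_prepare_memory_region() would call it through the new x86_ops
entry before installing the slot, so the bitmap always covers whatever
RAM KVM knows about.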

>
> Adding a kvm_enable_cap doesn't seem particularly complicated and side
> steps all of these concerns, so I still prefer it. Caveat, haven't
> reviewed the patches about the feature bits yet: the enable cap would
> also make it possible for kernels that support live migration to avoid
> advertising live migration if host usermode does not want it to be
> advertised. This seems pretty important, since hosts that don't plan
> to live migrate should have the ability to tell the guest to stop
> calling.
>
> Thanks,
> Steve
Steve Rutherford April 8, 2020, 12:35 a.m. UTC | #10
On Tue, Apr 7, 2020 at 5:29 PM Brijesh Singh <brijesh.singh@amd.com> wrote:
>
>
> On 4/7/20 7:01 PM, Steve Rutherford wrote:
> > On Mon, Apr 6, 2020 at 10:27 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
> >> Hello Steve,
> >>
> >> On Mon, Apr 06, 2020 at 07:17:37PM -0700, Steve Rutherford wrote:
> >>> On Sun, Mar 29, 2020 at 11:22 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
> >>>> From: Brijesh Singh <Brijesh.Singh@amd.com>
> >>>>
> >>>> This hypercall is used by the SEV guest to notify a change in the page
> >>>> encryption status to the hypervisor. The hypercall should be invoked
> >>>> only when the encryption attribute is changed from encrypted -> decrypted
> >>>> and vice versa. By default all guest pages are considered encrypted.
> >>>>
> >>>> Cc: Thomas Gleixner <tglx@linutronix.de>
> >>>> Cc: Ingo Molnar <mingo@redhat.com>
> >>>> Cc: "H. Peter Anvin" <hpa@zytor.com>
> >>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
> >>>> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> >>>> Cc: Joerg Roedel <joro@8bytes.org>
> >>>> Cc: Borislav Petkov <bp@suse.de>
> >>>> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> >>>> Cc: x86@kernel.org
> >>>> Cc: kvm@vger.kernel.org
> >>>> Cc: linux-kernel@vger.kernel.org
> >>>> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> >>>> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> >>>> ---
> >>>>  Documentation/virt/kvm/hypercalls.rst | 15 +++++
> >>>>  arch/x86/include/asm/kvm_host.h       |  2 +
> >>>>  arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
> >>>>  arch/x86/kvm/vmx/vmx.c                |  1 +
> >>>>  arch/x86/kvm/x86.c                    |  6 ++
> >>>>  include/uapi/linux/kvm_para.h         |  1 +
> >>>>  6 files changed, 120 insertions(+)
> >>>>
> >>>> diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> >>>> index dbaf207e560d..ff5287e68e81 100644
> >>>> --- a/Documentation/virt/kvm/hypercalls.rst
> >>>> +++ b/Documentation/virt/kvm/hypercalls.rst
> >>>> @@ -169,3 +169,18 @@ a0: destination APIC ID
> >>>>
> >>>>  :Usage example: When sending a call-function IPI-many to vCPUs, yield if
> >>>>                 any of the IPI target vCPUs was preempted.
> >>>> +
> >>>> +
> >>>> +8. KVM_HC_PAGE_ENC_STATUS
> >>>> +-------------------------
> >>>> +:Architecture: x86
> >>>> +:Status: active
> >>>> +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> >>>> +
> >>>> +a0: the guest physical address of the start page
> >>>> +a1: the number of pages
> >>>> +a2: encryption attribute
> >>>> +
> >>>> +   Where:
> >>>> +       * 1: Encryption attribute is set
> >>>> +       * 0: Encryption attribute is cleared
> >>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >>>> index 98959e8cd448..90718fa3db47 100644
> >>>> --- a/arch/x86/include/asm/kvm_host.h
> >>>> +++ b/arch/x86/include/asm/kvm_host.h
> >>>> @@ -1267,6 +1267,8 @@ struct kvm_x86_ops {
> >>>>
> >>>>         bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
> >>>>         int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
> >>>> +       int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> >>>> +                                 unsigned long sz, unsigned long mode);
> >>> Nit: spell out size instead of sz.
> >>>>  };
> >>>>
> >>>>  struct kvm_arch_async_pf {
> >>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> >>>> index 7c2721e18b06..1d8beaf1bceb 100644
> >>>> --- a/arch/x86/kvm/svm.c
> >>>> +++ b/arch/x86/kvm/svm.c
> >>>> @@ -136,6 +136,8 @@ struct kvm_sev_info {
> >>>>         int fd;                 /* SEV device fd */
> >>>>         unsigned long pages_locked; /* Number of pages locked */
> >>>>         struct list_head regions_list;  /* List of registered regions */
> >>>> +       unsigned long *page_enc_bmap;
> >>>> +       unsigned long page_enc_bmap_size;
> >>>>  };
> >>>>
> >>>>  struct kvm_svm {
> >>>> @@ -1991,6 +1993,9 @@ static void sev_vm_destroy(struct kvm *kvm)
> >>>>
> >>>>         sev_unbind_asid(kvm, sev->handle);
> >>>>         sev_asid_free(sev->asid);
> >>>> +
> >>>> +       kvfree(sev->page_enc_bmap);
> >>>> +       sev->page_enc_bmap = NULL;
> >>>>  }
> >>>>
> >>>>  static void avic_vm_destroy(struct kvm *kvm)
> >>>> @@ -7593,6 +7598,94 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >>>>         return ret;
> >>>>  }
> >>>>
> >>>> +static int sev_resize_page_enc_bitmap(struct kvm *kvm, unsigned long new_size)
> >>>> +{
> >>>> +       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >>>> +       unsigned long *map;
> >>>> +       unsigned long sz;
> >>>> +
> >>>> +       if (sev->page_enc_bmap_size >= new_size)
> >>>> +               return 0;
> >>>> +
> >>>> +       sz = ALIGN(new_size, BITS_PER_LONG) / 8;
> >>>> +
> >>>> +       map = vmalloc(sz);
> >>>> +       if (!map) {
> >>>> +               pr_err_once("Failed to allocate encrypted bitmap size %lx\n",
> >>>> +                               sz);
> >>>> +               return -ENOMEM;
> >>>> +       }
> >>>> +
> >>>> +       /* mark the page encrypted (by default) */
> >>>> +       memset(map, 0xff, sz);
> >>>> +
> >>>> +       bitmap_copy(map, sev->page_enc_bmap, sev->page_enc_bmap_size);
> >>>> +       kvfree(sev->page_enc_bmap);
> >>>> +
> >>>> +       sev->page_enc_bmap = map;
> >>>> +       sev->page_enc_bmap_size = new_size;
> >>>> +
> >>>> +       return 0;
> >>>> +}
> >>>> +
> >>>> +static int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
> >>>> +                                 unsigned long npages, unsigned long enc)
> >>>> +{
> >>>> +       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >>>> +       kvm_pfn_t pfn_start, pfn_end;
> >>>> +       gfn_t gfn_start, gfn_end;
> >>>> +       int ret;
> >>>> +
> >>>> +       if (!sev_guest(kvm))
> >>>> +               return -EINVAL;
> >>>> +
> >>>> +       if (!npages)
> >>>> +               return 0;
> >>>> +
> >>>> +       gfn_start = gpa_to_gfn(gpa);
> >>>> +       gfn_end = gfn_start + npages;
> >>>> +
> >>>> +       /* out of bound access error check */
> >>>> +       if (gfn_end <= gfn_start)
> >>>> +               return -EINVAL;
> >>>> +
> >>>> +       /* lets make sure that gpa exist in our memslot */
> >>>> +       pfn_start = gfn_to_pfn(kvm, gfn_start);
> >>>> +       pfn_end = gfn_to_pfn(kvm, gfn_end);
> >>>> +
> >>>> +       if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
> >>>> +               /*
> >>>> +                * Allow guest MMIO range(s) to be added
> >>>> +                * to the page encryption bitmap.
> >>>> +                */
> >>>> +               return -EINVAL;
> >>>> +       }
> >>>> +
> >>>> +       if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
> >>>> +               /*
> >>>> +                * Allow guest MMIO range(s) to be added
> >>>> +                * to the page encryption bitmap.
> >>>> +                */
> >>>> +               return -EINVAL;
> >>>> +       }
> >>>> +
> >>>> +       mutex_lock(&kvm->lock);
> >>>> +       ret = sev_resize_page_enc_bitmap(kvm, gfn_end);
> >>>> +       if (ret)
> >>>> +               goto unlock;
> >>>> +
> >>>> +       if (enc)
> >>>> +               __bitmap_set(sev->page_enc_bmap, gfn_start,
> >>>> +                               gfn_end - gfn_start);
> >>>> +       else
> >>>> +               __bitmap_clear(sev->page_enc_bmap, gfn_start,
> >>>> +                               gfn_end - gfn_start);
> >>>> +
> >>>> +unlock:
> >>>> +       mutex_unlock(&kvm->lock);
> >>>> +       return ret;
> >>>> +}
> >>>> +
> >>>>  static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
> >>>>  {
> >>>>         struct kvm_sev_cmd sev_cmd;
> >>>> @@ -7995,6 +8088,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
> >>>>         .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
> >>>>
> >>>>         .apic_init_signal_blocked = svm_apic_init_signal_blocked,
> >>>> +
> >>>> +       .page_enc_status_hc = svm_page_enc_status_hc,
> >>>>  };
> >>>>
> >>>>  static int __init svm_init(void)
> >>>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> >>>> index 079d9fbf278e..f68e76ee7f9c 100644
> >>>> --- a/arch/x86/kvm/vmx/vmx.c
> >>>> +++ b/arch/x86/kvm/vmx/vmx.c
> >>>> @@ -8001,6 +8001,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> >>>>         .nested_get_evmcs_version = NULL,
> >>>>         .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
> >>>>         .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
> >>>> +       .page_enc_status_hc = NULL,
> >>>>  };
> >>>>
> >>>>  static void vmx_cleanup_l1d_flush(void)
> >>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>>> index cf95c36cb4f4..68428eef2dde 100644
> >>>> --- a/arch/x86/kvm/x86.c
> >>>> +++ b/arch/x86/kvm/x86.c
> >>>> @@ -7564,6 +7564,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> >>>>                 kvm_sched_yield(vcpu->kvm, a0);
> >>>>                 ret = 0;
> >>>>                 break;
> >>>> +       case KVM_HC_PAGE_ENC_STATUS:
> >>>> +               ret = -KVM_ENOSYS;
> >>>> +               if (kvm_x86_ops->page_enc_status_hc)
> >>>> +                       ret = kvm_x86_ops->page_enc_status_hc(vcpu->kvm,
> >>>> +                                       a0, a1, a2);
> >>>> +               break;
> >>>>         default:
> >>>>                 ret = -KVM_ENOSYS;
> >>>>                 break;
> >>>> diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
> >>>> index 8b86609849b9..847b83b75dc8 100644
> >>>> --- a/include/uapi/linux/kvm_para.h
> >>>> +++ b/include/uapi/linux/kvm_para.h
> >>>> @@ -29,6 +29,7 @@
> >>>>  #define KVM_HC_CLOCK_PAIRING           9
> >>>>  #define KVM_HC_SEND_IPI                10
> >>>>  #define KVM_HC_SCHED_YIELD             11
> >>>> +#define KVM_HC_PAGE_ENC_STATUS         12
> >>>>
> >>>>  /*
> >>>>   * hypercalls use architecture specific
> >>>> --
> >>>> 2.17.1
> >>>>
> >>> I'm still not excited by the dynamic resizing. I believe the guest
> >>> hypercall can be called in atomic contexts, which makes me
> >>> particularly unexcited to see a potentially large vmalloc on the host
> >>> followed by filling the buffer. Particularly when the buffer might be
> >>> non-trivial in size (~1MB per 32GB, per some back of the envelope
> >>> math).
> >>>
> >> I think looking at more practical situations, most hypercalls will
> >> happen during the boot stage, when device specific initializations are
> >> happening, so typically the maximum page encryption bitmap size would
> >> be allocated early enough.
> >>
> >> In fact, initial hypercalls made by OVMF will probably allocate the
> >> maximum page bitmap size even before the kernel comes up, especially
> >> as they will be setting up page enc/dec status for MMIO, ROM, ACPI
> >> regions, PCI device memory, etc., and most importantly for
> >> "non-existent" high memory range (which will probably be the
> >> maximum size page encryption bitmap allocated/resized).
> >>
> >> Let me know if you have different thoughts on this ?
> > Hi Ashish,
> >
> > If this is not an issue in practice, we can just move past this. If we
> > are basically guaranteed that OVMF will trigger hypercalls that expand
> > the bitmap beyond the top of memory, then, yes, that should work. That
> > leaves me slightly nervous that OVMF might regress since it's not
> > obvious that calling a hypercall beyond the top of memory would be
> > "required" for avoiding a somewhat indirectly related issue in guest
> > kernels.
>
>
> If possible then we should try to avoid growing/shrinking the bitmap .
> Today OVMF may not be accessing beyond memory but a malicious guest
> could send a hypercall down which can trigger a huge memory allocation
> on the host side and may eventually cause denial of service for other.
Nice catch! Was just writing up an email about this.
> I am in favor if we can find some solution to handle this case. How
> about Steve's suggestion about VMM making a call down to the kernel to
> tell how big the bitmap should be? Initially it should be equal to the
> guest RAM and if VMM ever did the memory expansion then it can send down
> another notification to increase the bitmap ?
>
> Optionally, instead of adding a new ioctl, I was wondering if we can
> extend the kvm_arch_prepare_memory_region() to make svm specific x86_ops
> which can take read the userspace provided memory region and calculate
> the amount of guest RAM managed by the KVM and grow/shrink the bitmap
> based on that information. I have not looked deep enough to see if its
> doable but if it can work then we can avoid adding yet another ioctl.
We also have the set bitmap ioctl in a later patch in this series. We
could also use the set ioctl for initialization (a little excessive,
since there would be an additional ephemeral allocation and a few extra
buffer copies, but that's probably fine). An enable_cap has the added
benefit of probably being necessary anyway, so usermode can disable the
migration feature flag.

In general, userspace is going to have to be in direct control of the
buffer and its size.
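
For initialization that would look roughly like the below from
usermode (sketched against the struct in the later patch; field names
quoted from memory):

    struct kvm_page_enc_bitmap bmap = {
            .start_gfn = 0,
            .num_pages = guest_ram_pages,
            /* all-ones buffer: every page encrypted by default */
            .enc_bitmap = init_bitmap,
    };

    if (ioctl(vm_fd, KVM_SET_PAGE_ENC_BITMAP, &bmap) < 0)
            err(1, "KVM_SET_PAGE_ENC_BITMAP");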
Kalra, Ashish April 8, 2020, 1:17 a.m. UTC | #11
Hello Steve, Brijesh,

On Tue, Apr 07, 2020 at 05:35:57PM -0700, Steve Rutherford wrote:
> On Tue, Apr 7, 2020 at 5:29 PM Brijesh Singh <brijesh.singh@amd.com> wrote:
> >
> >
> > On 4/7/20 7:01 PM, Steve Rutherford wrote:
> > > On Mon, Apr 6, 2020 at 10:27 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
> > >> Hello Steve,
> > >>
> > >> On Mon, Apr 06, 2020 at 07:17:37PM -0700, Steve Rutherford wrote:
> > >>> On Sun, Mar 29, 2020 at 11:22 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
> > >>>> [...]
> > >>> I'm still not excited by the dynamic resizing. I believe the guest
> > >>> hypercall can be called in atomic contexts, which makes me
> > >>> particularly unexcited to see a potentially large vmalloc on the host
> > >>> followed by filling the buffer. Particularly when the buffer might be
> > >>> non-trivial in size (~1MB per 32GB, per some back of the envelope
> > >>> math).
> > >>>
> > >> I think looking at more practical situations, most hypercalls will
> > >> happen during the boot stage, when device specific initializations are
> > >> happening, so typically the maximum page encryption bitmap size would
> > >> be allocated early enough.
> > >>
> > >> In fact, initial hypercalls made by OVMF will probably allocate the
> > >> maximum page bitmap size even before the kernel comes up, especially
> > >> as they will be setting up page enc/dec status for MMIO, ROM, ACPI
> > >> regions, PCI device memory, etc., and most importantly for
> > >> "non-existent" high memory range (which will probably be the
> > >> maximum size page encryption bitmap allocated/resized).
> > >>
> > >> Let me know if you have different thoughts on this ?
> > > Hi Ashish,
> > >
> > > If this is not an issue in practice, we can just move past this. If we
> > > are basically guaranteed that OVMF will trigger hypercalls that expand
> > > the bitmap beyond the top of memory, then, yes, that should work. That
> > > leaves me slightly nervous that OVMF might regress since it's not
> > > obvious that calling a hypercall beyond the top of memory would be
> > > "required" for avoiding a somewhat indirectly related issue in guest
> > > kernels.
> >
> >
> > If possible, we should try to avoid growing/shrinking the bitmap.
> > Today OVMF may not be accessing beyond memory, but a malicious guest
> > could send a hypercall down which can trigger a huge memory allocation
> > on the host side and may eventually cause denial of service for others.
> Nice catch! Was just writing up an email about this.
> > I am in favor of finding some solution to handle this case. How
> > about Steve's suggestion of the VMM making a call down to the kernel to
> > tell it how big the bitmap should be? Initially it should be equal to the
> > guest RAM, and if the VMM ever does a memory expansion it can send down
> > another notification to increase the bitmap.
> >
> > Optionally, instead of adding a new ioctl, I was wondering if we can
> > extend kvm_arch_prepare_memory_region() to add an svm-specific x86_op
> > which can read the userspace-provided memory region, calculate
> > the amount of guest RAM managed by KVM, and grow/shrink the bitmap
> > based on that information. I have not looked deeply enough to see if it's
> > doable, but if it can work then we can avoid adding yet another ioctl.
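A minimal sketch of the memslot-driven hook Brijesh describes (the hook
name is hypothetical; sev_resize_page_enc_bitmap() is the helper from
the patch above), assuming kvm_arch_prepare_memory_region() forwards
newly created memslots to it:

static int svm_prepare_page_enc_bitmap(struct kvm *kvm,
                                       const struct kvm_memory_slot *slot)
{
        unsigned long last_gfn = slot->base_gfn + slot->npages;
        int ret;

        if (!sev_guest(kvm))
                return 0;

        /* Grow (never shrink) the bitmap to cover the new memslot. */
        mutex_lock(&kvm->lock);
        ret = sev_resize_page_enc_bitmap(kvm, last_gfn);
        mutex_unlock(&kvm->lock);

        return ret;
}

This way the bitmap stays bounded by the guest RAM the VMM actually
configured, and a guest hypercall can no longer force an arbitrarily
large host-side allocation.
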
> We also have the set bitmap ioctl in a later patch in this series. We
> could also use the set ioctl for initialization (it's a little
> excessive for initialization since there will be an additional
> ephemeral allocation and a few additional buffer copies, but that's
> probably fine). An enable_cap has the added benefit of probably being
> necessary anyway so usermode can disable the migration feature flag.
> 
> In general, userspace is going to have to be in direct control of the
> buffer and its size.
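A rough userspace sketch of the enable_cap approach (the capability
number and argument layout here are hypothetical, not from this series):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical capability: args[0] = number of gfns the bitmap must cover. */
#define KVM_CAP_SEV_PAGE_ENC_BITMAP 1000

static int sev_live_migration_setup(int vm_fd, unsigned long nr_gfns)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_SEV_PAGE_ENC_BITMAP,
                .args[0] = nr_gfns,
        };

        /* KVM would size (or disable) the page encryption bitmap accordingly. */
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
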

My only practical concern about setting a static bitmap size based on guest
memory is the hypercalls made initially by OVMF to set page
enc/dec status for ROM, ACPI regions and especially the non-existent
high memory range. The new ioctl will statically set up the bitmap size to
whatever guest RAM is specified, say 4G, 8G, etc., but the OVMF
hypercall for non-existent memory will cover a guest
physical memory range like ~6G->64G (for a 4G guest RAM setup); this
hypercall will basically have to just return doing nothing, because
the allocated bitmap won't have this guest physical range available?
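(For concreteness: with 4K pages, a 4G guest needs a bitmap covering
0x100000 gfns (gfns 0-0xfffff), while a ~6G->64G hypercall touches gfns
0x180000-0xffffff, entirely beyond it.)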

Also, hypercalls for ROM, ACPI, device regions and any memory holes within
the static bitmap setup as per guest RAM config will work, but what
about hypercalls for any device regions beyond the guest RAM config?

Thanks,
Ashish
Steve Rutherford April 8, 2020, 1:38 a.m. UTC | #12
On Tue, Apr 7, 2020 at 6:17 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
>
> Hello Steve, Brijesh,
>
> > [snip: full patch and earlier discussion, quoted in full earlier in the thread]
>
> My only practical concern about setting a static bitmap size based on guest
> memory is the hypercalls made initially by OVMF to set page
> enc/dec status for ROM, ACPI regions and especially the non-existent
> high memory range. The new ioctl will statically set up the bitmap size to
> whatever guest RAM is specified, say 4G, 8G, etc., but the OVMF
> hypercall for non-existent memory will cover a guest
> physical memory range like ~6G->64G (for a 4G guest RAM setup); this
> hypercall will basically have to just return doing nothing, because
> the allocated bitmap won't have this guest physical range available?
>
> Also, hypercalls for ROM, ACPI, device regions and any memory holes within
> the static bitmap setup as per guest RAM config will work, but what
> about hypercalls for any device regions beyond the guest RAM config?
>
> Thanks,
> Ashish
I'm not super familiar with what the address space beyond the top of RAM is
used for. If the memory is not backed by RAM, will it even matter for
migration? It sounds like the encryption for SEV won't even apply to it.
If we don't need to know what the C-bit state of an address is, we
don't need to track it. It doesn't hurt to track it (which is why I'm
not super concerned about tracking the memory holes).
Brijesh Singh April 8, 2020, 2:34 a.m. UTC | #13
On 4/7/20 8:38 PM, Steve Rutherford wrote:
> On Tue, Apr 7, 2020 at 6:17 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
>> [snip: full patch and earlier discussion, quoted in full earlier in the thread]
>> My only practical concern about setting a static bitmap size based on guest
>> memory is the hypercalls made initially by OVMF to set page
>> enc/dec status for ROM, ACPI regions and especially the non-existent
>> high memory range. The new ioctl will statically set up the bitmap size to
>> whatever guest RAM is specified, say 4G, 8G, etc., but the OVMF
>> hypercall for non-existent memory will cover a guest
>> physical memory range like ~6G->64G (for a 4G guest RAM setup); this
>> hypercall will basically have to just return doing nothing, because
>> the allocated bitmap won't have this guest physical range available?


IMO, OVMF issuing a hypercall beyond the guest RAM is simply wrong; it
should *not* do that.  There was a feature request I submitted some time
back to Tianocore, https://bugzilla.tianocore.org/show_bug.cgi?id=623, as
I saw this coming in the future. I tried highlighting the problem that the
MdeModulePkg does not provide a notifier to tell OVMF when the core
creates the MMIO holes etc. It was not a big problem with SEV
initially because we were never getting down to the hypervisor to do
something about those non-existent regions. But with migration it is
now important that we restart the discussion with the UEFI folks and
see what can be done. In the kernel patches we should do what is right
for the kernel and not work around the OVMF limitation.


Kalra, Ashish April 8, 2020, 3:18 a.m. UTC | #14
Hello Brijesh,

On Tue, Apr 07, 2020 at 09:34:15PM -0500, Brijesh Singh wrote:
> 
> [snip: full patch and earlier discussion, quoted in full earlier in the thread]
> IMO, OVMF issuing a hypercall beyond the guest RAM is simply wrong; it
> should *not* do that.  There was a feature request I submitted some time
> back to Tianocore, https://bugzilla.tianocore.org/show_bug.cgi?id=623, as
> I saw this coming in the future. I tried highlighting the problem that the
> MdeModulePkg does not provide a notifier to tell OVMF when the core
> creates the MMIO holes etc. It was not a big problem with SEV
> initially because we were never getting down to the hypervisor to do
> something about those non-existent regions. But with migration it is
> now important that we restart the discussion with the UEFI folks and
> see what can be done. In the kernel patches we should do what is right
> for the kernel and not work around the OVMF limitation.

Ok, this makes sense. I will start exploring
kvm_arch_prepare_memory_region() to see if it can assist in computing
the guest RAM; otherwise I will look at adding a new ioctl interface
for the same.

Thanks,
Ashish

Kalra, Ashish April 9, 2020, 4:18 p.m. UTC | #15
Hello Brijesh, Steve,

On Wed, Apr 08, 2020 at 03:18:18AM +0000, Ashish Kalra wrote:
> Hello Brijesh,
> 
> On Tue, Apr 07, 2020 at 09:34:15PM -0500, Brijesh Singh wrote:
> > 
> > On 4/7/20 8:38 PM, Steve Rutherford wrote:
> > > On Tue, Apr 7, 2020 at 6:17 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
> > >> Hello Steve, Brijesh,
> > >>
> > >> On Tue, Apr 07, 2020 at 05:35:57PM -0700, Steve Rutherford wrote:
> > >>> On Tue, Apr 7, 2020 at 5:29 PM Brijesh Singh <brijesh.singh@amd.com> wrote:
> > >>>>
> > >>>> On 4/7/20 7:01 PM, Steve Rutherford wrote:
> > >>>>> On Mon, Apr 6, 2020 at 10:27 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
> > >>>>>> Hello Steve,
> > >>>>>>
> > >>>>>> On Mon, Apr 06, 2020 at 07:17:37PM -0700, Steve Rutherford wrote:
> > >>>>>>> On Sun, Mar 29, 2020 at 11:22 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
> > >>>>>>>> From: Brijesh Singh <Brijesh.Singh@amd.com>
> > >>>>>>>>
> > >>>>>>>> This hypercall is used by the SEV guest to notify a change in the page
> > >>>>>>>> encryption status to the hypervisor. The hypercall should be invoked
> > >>>>>>>> only when the encryption attribute is changed from encrypted -> decrypted
> > >>>>>>>> and vice versa. By default all guest pages are considered encrypted.
> > >>>>>>>>
> > >>>>>>>> Cc: Thomas Gleixner <tglx@linutronix.de>
> > >>>>>>>> Cc: Ingo Molnar <mingo@redhat.com>
> > >>>>>>>> Cc: "H. Peter Anvin" <hpa@zytor.com>
> > >>>>>>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
> > >>>>>>>> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> > >>>>>>>> Cc: Joerg Roedel <joro@8bytes.org>
> > >>>>>>>> Cc: Borislav Petkov <bp@suse.de>
> > >>>>>>>> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> > >>>>>>>> Cc: x86@kernel.org
> > >>>>>>>> Cc: kvm@vger.kernel.org
> > >>>>>>>> Cc: linux-kernel@vger.kernel.org
> > >>>>>>>> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> > >>>>>>>> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> > >>>>>>>> ---
> > >>>>>>>>  Documentation/virt/kvm/hypercalls.rst | 15 +++++
> > >>>>>>>>  arch/x86/include/asm/kvm_host.h       |  2 +
> > >>>>>>>>  arch/x86/kvm/svm.c                    | 95 +++++++++++++++++++++++++++
> > >>>>>>>>  arch/x86/kvm/vmx/vmx.c                |  1 +
> > >>>>>>>>  arch/x86/kvm/x86.c                    |  6 ++
> > >>>>>>>>  include/uapi/linux/kvm_para.h         |  1 +
> > >>>>>>>>  6 files changed, 120 insertions(+)
> > >>>>>>>>
> > >>>>>>>> diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> > >>>>>>>> index dbaf207e560d..ff5287e68e81 100644
> > >>>>>>>> --- a/Documentation/virt/kvm/hypercalls.rst
> > >>>>>>>> +++ b/Documentation/virt/kvm/hypercalls.rst
> > >>>>>>>> @@ -169,3 +169,18 @@ a0: destination APIC ID
> > >>>>>>>>
> > >>>>>>>>  :Usage example: When sending a call-function IPI-many to vCPUs, yield if
> > >>>>>>>>                 any of the IPI target vCPUs was preempted.
> > >>>>>>>> +
> > >>>>>>>> +
> > >>>>>>>> +8. KVM_HC_PAGE_ENC_STATUS
> > >>>>>>>> +-------------------------
> > >>>>>>>> +:Architecture: x86
> > >>>>>>>> +:Status: active
> > >>>>>>>> +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> > >>>>>>>> +
> > >>>>>>>> +a0: the guest physical address of the start page
> > >>>>>>>> +a1: the number of pages
> > >>>>>>>> +a2: encryption attribute
> > >>>>>>>> +
> > >>>>>>>> +   Where:
> > >>>>>>>> +       * 1: Encryption attribute is set
> > >>>>>>>> +       * 0: Encryption attribute is cleared
> > >>>>>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > >>>>>>>> index 98959e8cd448..90718fa3db47 100644
> > >>>>>>>> --- a/arch/x86/include/asm/kvm_host.h
> > >>>>>>>> +++ b/arch/x86/include/asm/kvm_host.h
> > >>>>>>>> @@ -1267,6 +1267,8 @@ struct kvm_x86_ops {
> > >>>>>>>>
> > >>>>>>>>         bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
> > >>>>>>>>         int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
> > >>>>>>>> +       int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> > >>>>>>>> +                                 unsigned long sz, unsigned long mode);
> > >>>>>>> Nit: spell out size instead of sz.
> > >>>>>>>>  };
> > >>>>>>>>
> > >>>>>>>>  struct kvm_arch_async_pf {
> > >>>>>>>> [... remainder of the patch snipped; see the patch at the end of this page ...]
> > >>>>>>>>
> > >>>>>>> I'm still not excited by the dynamic resizing. I believe the guest
> > >>>>>>> hypercall can be called in atomic contexts, which makes me
> > >>>>>>> particularly unexcited to see a potentially large vmalloc on the host
> > >>>>>>> followed by filling the buffer. Particularly when the buffer might be
> > >>>>>>> non-trivial in size (~1MB per 32GB, per some back of the envelope
> > >>>>>>> math).
> > >>>>>>>
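(For reference, that back-of-the-envelope math, assuming 4 KiB pages and one
bitmap bit per page:

	32 GiB / 4 KiB per page = 2^23 pages
	2^23 bits / 8 bits per byte = 2^20 bytes = 1 MiB

so the buffer grows by roughly 1 MiB for every 32 GiB of guest RAM covered.)
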
> > >>>>>> Looking at more practical situations, I think most hypercalls will
> > >>>>>> happen during the boot stage, when device-specific initialization is
> > >>>>>> happening, so typically the maximum page encryption bitmap size would
> > >>>>>> be allocated early enough.
> > >>>>>>
> > >>>>>> In fact, initial hypercalls made by OVMF will probably allocate the
> > >>>>>> maximum page bitmap size even before the kernel comes up, especially
> > >>>>>> as they will be setting up page enc/dec status for MMIO, ROM, ACPI
> > >>>>>> regions, PCI device memory, etc., and most importantly for the
> > >>>>>> "non-existent" high memory range (which will probably drive the
> > >>>>>> maximum page encryption bitmap size allocated/resized).
> > >>>>>>
> > >>>>>> Let me know if you have different thoughts on this?
> > >>>>> Hi Ashish,
> > >>>>>
> > >>>>> If this is not an issue in practice, we can just move past this. If we
> > >>>>> are basically guaranteed that OVMF will trigger hypercalls that expand
> > >>>>> the bitmap beyond the top of memory, then, yes, that should work. That
> > >>>>> leaves me slightly nervous that OVMF might regress since it's not
> > >>>>> obvious that calling a hypercall beyond the top of memory would be
> > >>>>> "required" for avoiding a somewhat indirectly related issue in guest
> > >>>>> kernels.
> > >>>>
> > >>>> If possible, we should try to avoid growing/shrinking the bitmap.
> > >>>> Today OVMF may not be accessing beyond guest memory, but a malicious
> > >>>> guest could send a hypercall down which can trigger a huge memory
> > >>>> allocation on the host side and may eventually cause denial of service
> > >>>> for others.
> > >>> Nice catch! Was just writing up an email about this.
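
(To put a number on this: sev_resize_page_enc_bitmap() sizes its allocation
from whatever gfn range the guest names, and a gfn_end that lands in no
memslot sails through the is_noslot_pfn() checks. So a single bogus
hypercall, say gpa = 0 and npages = 1 << 40, would request

	sz = ALIGN(1 << 40, BITS_PER_LONG) / 8 = 2^37 bytes = 128 GiB

of vmalloc on the host. The figures here are illustrative, not taken from
the thread.)
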
> > >>>> I am in favor of finding some solution to handle this case. How
> > >>>> about Steve's suggestion of the VMM making a call down to the kernel to
> > >>>> tell it how big the bitmap should be? Initially it would be equal to the
> > >>>> guest RAM, and if the VMM ever expands the memory it can send down
> > >>>> another notification to increase the bitmap?
> > >>>>
> > >>>> Optionally, instead of adding a new ioctl, I was wondering if we can
> > >>>> extend kvm_arch_prepare_memory_region() with an SVM-specific x86_ops
> > >>>> which can read the userspace-provided memory region, calculate
> > >>>> the amount of guest RAM managed by KVM, and grow/shrink the bitmap
> > >>>> based on that information. I have not looked deep enough to see if it's
> > >>>> doable, but if it can work then we can avoid adding yet another ioctl.
> > >>> We also have the set bitmap ioctl in a later patch in this series. We
> > >>> could also use the set ioctl for initialization (it's a little
> > >>> excessive for initialization since there will be an additional
> > >>> ephemeral allocation and a few additional buffer copies, but that's
> > >>> probably fine). An enable_cap has the added benefit of probably being
> > >>> necessary anyway so usermode can disable the migration feature flag.
> > >>>
> > >>> In general, userspace is going to have to be in direct control of the
> > >>> buffer and its size.
> > >> My only practical concern about setting a static bitmap size based on guest
> > >> memory is the hypercalls made initially by OVMF to set page
> > >> enc/dec status for ROM, ACPI regions and especially the non-existent
> > >> high memory range. The new ioctl will statically set the bitmap size to
> > >> whatever guest RAM is specified, say 4G, 8G, etc., but OVMF will issue a
> > >> hypercall for a non-existent guest physical memory range such as
> > >> ~6G->64G (for a 4G guest RAM setup); won't that hypercall basically have
> > >> to return doing nothing, because the allocated bitmap won't cover this
> > >> guest physical range?
> > 
> > 
> > IMO, OVMF issuing a hypercall beyond the guest RAM is simply wrong; it
> > should *not* do that. There was a feature request I submitted some time
> > back to Tianocore https://bugzilla.tianocore.org/show_bug.cgi?id=623 as
> > I saw this coming in the future. I tried highlighting the problem in the
> > MdeModulePkg that it does not provide a notifier to tell OVMF when the core
> > creates the MMIO holes etc. It was not a big problem with SEV
> > initially because we were never getting down to the hypervisor to do
> > something about those non-existent regions. But with migration it is
> > now important that we restart the discussion with the UEFI folks and
> > see what can be done. In the kernel patches we should do what is right
> > for the kernel and not work around the OVMF limitation.
> 
> Ok, this makes sense. I will start exploring
> kvm_arch_prepare_memory_region() to see if it can assist in computing
> the guest RAM; otherwise I will look at adding a new ioctl interface
> for the same.
> 

I looked at kvm_arch_prepare_memory_region() and
kvm_arch_commit_memory_region(), and the latter looks ideal to use for
this.

I walked the kvm_memslots in this function and I can compute the
approximate guest RAM mapped by KVM. However, I get the guest RAM size as
"twice" the configured size because of the two address spaces on x86 KVM;
I believe the additional address space is for SMM/SMRAM use.

I don't think we have a use case for migrating an SEV guest with SMM
support?

Considering that, I believe that I can just compute the guest RAM size
using memslots for address space #0 and use that to grow/shrink the bitmap.

As you mentioned, I will need to add a new SVM-specific x86_ops callback
invoked as part of kvm_arch_commit_memory_region(), which will in turn
call sev_resize_page_enc_bitmap().
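
A rough sketch of that callback, with the caveat that the hook name and the
exact call site below are hypothetical (this is the stated plan, not code
from the series):

	/* Called from kvm_arch_commit_memory_region() via a new x86_ops hook. */
	static void svm_commit_memory_region(struct kvm *kvm)
	{
		struct kvm_memslots *slots = kvm_memslots(kvm); /* as_id #0 */
		struct kvm_memory_slot *memslot;
		unsigned long nr_gfns = 0;

		/* Approximate guest RAM as the sum of the memslot sizes. */
		kvm_for_each_memslot(memslot, slots)
			nr_gfns += memslot->npages;

		mutex_lock(&kvm->lock);
		sev_resize_page_enc_bitmap(kvm, nr_gfns);
		mutex_unlock(&kvm->lock);
	}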

Thanks,
Ashish

> > 
> > 
> > >> Also, hypercalls for ROM, ACPI, device regions and any memory holes within
> > >> the static bitmap setup as per the guest RAM config will work, but what
> > >> about hypercalls for any device regions beyond the guest RAM config?
> > >>
> > >> Thanks,
> > >> Ashish
> > > I'm not super familiar with what the address space beyond the top of RAM is
> > > used for. If the memory is not backed by RAM, will it even matter for
> > > migration? Sounds like the encryption for SEV won't even apply to it.
> > > If we don't need to know what the c-bit state of an address is, we
> > > don't need to track it. It doesn't hurt to track it (which is why I'm
> > > not super concerned about tracking the memory holes).
Steve Rutherford April 9, 2020, 8:41 p.m. UTC | #16
On Thu, Apr 9, 2020 at 9:18 AM Ashish Kalra <ashish.kalra@amd.com> wrote:
>
> Hello Brijesh, Steve,
>
> On Wed, Apr 08, 2020 at 03:18:18AM +0000, Ashish Kalra wrote:
> > Hello Brijesh,
> >
> > On Tue, Apr 07, 2020 at 09:34:15PM -0500, Brijesh Singh wrote:
> > >
> > > On 4/7/20 8:38 PM, Steve Rutherford wrote:
> > > > On Tue, Apr 7, 2020 at 6:17 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
> > > >> Hello Steve, Brijesh,
> > > >>
> > > >> On Tue, Apr 07, 2020 at 05:35:57PM -0700, Steve Rutherford wrote:
> > > >>> On Tue, Apr 7, 2020 at 5:29 PM Brijesh Singh <brijesh.singh@amd.com> wrote:
> > > >>>>
> > > >>>> On 4/7/20 7:01 PM, Steve Rutherford wrote:
> > > >>>>> On Mon, Apr 6, 2020 at 10:27 PM Ashish Kalra <ashish.kalra@amd.com> wrote:
> > > >>>>>> Hello Steve,
> > > >>>>>>
> > > >>>>>> On Mon, Apr 06, 2020 at 07:17:37PM -0700, Steve Rutherford wrote:
> > > >>>>>>> On Sun, Mar 29, 2020 at 11:22 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
> > > >>>>>>>> [... full patch quoted; snipped, see the patch at the end of this page ...]
> > >
> > > [... earlier discussion snipped; see the thread above ...]
> >
> > Ok, this makes sense. I will start exploring
> > kvm_arch_prepare_memory_region() to see if it can assist in computing
> > the guest RAM; otherwise I will look at adding a new ioctl interface
> > for the same.
> >
>
> I looked at kvm_arch_prepare_memory_region() and
> kvm_arch_commit_memory_region(), and the latter looks ideal to use for
> this.
>
> I walked the kvm_memslots in this function and I can compute the
> approximate guest RAM mapped by KVM. However, I get the guest RAM size as
> "twice" the configured size because of the two address spaces on x86 KVM;
> I believe the additional address space is for SMM/SMRAM use.
>
> I don't think we have a use case for migrating an SEV guest with SMM
> support?
>
> Considering that, I believe that I can just compute the guest RAM size
> using memslots for address space #0 and use that to grow/shrink the bitmap.
>
> As you mentioned, I will need to add a new SVM-specific x86_ops callback
> invoked as part of kvm_arch_commit_memory_region(), which will in turn
> call sev_resize_page_enc_bitmap().
>
> Thanks,
> Ashish
>
Supporting migration of an SEV VM with SMM seems unnecessary, but I
wouldn't go out of my way to make it harder to fix. The bitmap is
as_id-unaware already (as it likely should be, since the PA space is
shared), so I would personally ignore summing up the sizes, and
instead look for the highest PA that is mapped by a memslot. If I'm
not mistaken, the address spaces should (more or less) entirely
overlap. This will be suboptimal if the PA space is sparse, but that
would be weird anyway.
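
(Concretely, that would mean replacing a sum over memslot sizes with a max
over their end addresses, along these lines, again purely as a sketch:

	gfn_t end = 0;

	kvm_for_each_memslot(memslot, kvm_memslots(kvm))
		end = max(end, memslot->base_gfn + memslot->npages);

	sev_resize_page_enc_bitmap(kvm, end);

so sparse or high-placed memslots size the bitmap by the highest mapped gfn
rather than by the total RAM.)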

You could also go back to a suggestion I had much earlier and have the
memslots own the bitmaps. I believe there is space to add flags for
enabling and disabling features like this on memslots. Basically, you
could treat this in the same way as dirty bitmaps, which seem pretty
similar. This would work well even if PA space were sparse.
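
(For comparison, dirty logging already keeps its per-slot state on struct
kvm_memory_slot, roughly:

	struct kvm_memory_slot {
		gfn_t base_gfn;
		unsigned long npages;
		unsigned long *dirty_bitmap; /* allocated when dirty logging is on */
		/* ... */
	};

A per-slot page encryption bitmap gated by a new memslot flag, analogous to
how KVM_MEM_LOG_DIRTY_PAGES gates dirty_bitmap, would follow the same
pattern; the flag and field here are speculative, not part of this series.)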

The current patch set is nowhere near sufficient for supporting SMM
migration securely, which is fine: we would need to separate out
access to the hypercall for particular pages, so that non-SMM could
not corrupt SMM. And then the code in SMM would also need to support
the hypercall if it ever used shared pages. Until someone asks for
this I don't believe it is worth doing. If we put the bitmaps on the
memslots, we could probably limit access to a memslot's bitmap to
vcpus that can access that memslot. This seems unnecessary for now.

--Steve
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
index dbaf207e560d..ff5287e68e81 100644
--- a/Documentation/virt/kvm/hypercalls.rst
+++ b/Documentation/virt/kvm/hypercalls.rst
@@ -169,3 +169,18 @@  a0: destination APIC ID
 
 :Usage example: When sending a call-function IPI-many to vCPUs, yield if
 	        any of the IPI target vCPUs was preempted.
+
+
+8. KVM_HC_PAGE_ENC_STATUS
+-------------------------
+:Architecture: x86
+:Status: active
+:Purpose: Notify the hypervisor of encryption status changes in the guest page tables (SEV guest)
+
+a0: the guest physical address of the start page
+a1: the number of pages
+a2: encryption attribute
+
+   Where:
+	* 1: Encryption attribute is set
+	* 0: Encryption attribute is cleared
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 98959e8cd448..90718fa3db47 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1267,6 +1267,8 @@  struct kvm_x86_ops {
 
 	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
 	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
+	int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
+				  unsigned long sz, unsigned long mode);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7c2721e18b06..1d8beaf1bceb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -136,6 +136,8 @@  struct kvm_sev_info {
 	int fd;			/* SEV device fd */
 	unsigned long pages_locked; /* Number of pages locked */
 	struct list_head regions_list;  /* List of registered regions */
+	unsigned long *page_enc_bmap;
+	unsigned long page_enc_bmap_size;
 };
 
 struct kvm_svm {
@@ -1991,6 +1993,9 @@  static void sev_vm_destroy(struct kvm *kvm)
 
 	sev_unbind_asid(kvm, sev->handle);
 	sev_asid_free(sev->asid);
+
+	kvfree(sev->page_enc_bmap);
+	sev->page_enc_bmap = NULL;
 }
 
 static void avic_vm_destroy(struct kvm *kvm)
@@ -7593,6 +7598,94 @@  static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int sev_resize_page_enc_bitmap(struct kvm *kvm, unsigned long new_size)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	unsigned long *map;
+	unsigned long sz;
+
+	if (sev->page_enc_bmap_size >= new_size)
+		return 0;
+
+	sz = ALIGN(new_size, BITS_PER_LONG) / 8;
+
+	map = vmalloc(sz);
+	if (!map) {
+		pr_err_once("Failed to allocate encrypted bitmap size %lx\n",
+				sz);
+		return -ENOMEM;
+	}
+
+	/* mark the page encrypted (by default) */
+	memset(map, 0xff, sz);
+
+	bitmap_copy(map, sev->page_enc_bmap, sev->page_enc_bmap_size);
+	kvfree(sev->page_enc_bmap);
+
+	sev->page_enc_bmap = map;
+	sev->page_enc_bmap_size = new_size;
+
+	return 0;
+}
+
+static int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
+				  unsigned long npages, unsigned long enc)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	kvm_pfn_t pfn_start, pfn_end;
+	gfn_t gfn_start, gfn_end;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -EINVAL;
+
+	if (!npages)
+		return 0;
+
+	gfn_start = gpa_to_gfn(gpa);
+	gfn_end = gfn_start + npages;
+
+	/* out-of-bounds access / overflow check */
+	if (gfn_end <= gfn_start)
+		return -EINVAL;
+
+	/* let's make sure that the gpa exists in our memslots */
+	pfn_start = gfn_to_pfn(kvm, gfn_start);
+	pfn_end = gfn_to_pfn(kvm, gfn_end);
+
+	if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
+		/*
+		 * Reject real pfn lookup errors; noslot pfns pass so guest
+		 * MMIO range(s) can be added to the page encryption bitmap.
+		 */
+		return -EINVAL;
+	}
+
+	if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
+		/*
+		 * Reject real pfn lookup errors; noslot pfns pass so guest
+		 * MMIO range(s) can be added to the page encryption bitmap.
+		 */
+		return -EINVAL;
+	}
+
+	mutex_lock(&kvm->lock);
+	ret = sev_resize_page_enc_bitmap(kvm, gfn_end);
+	if (ret)
+		goto unlock;
+
+	if (enc)
+		__bitmap_set(sev->page_enc_bmap, gfn_start,
+				gfn_end - gfn_start);
+	else
+		__bitmap_clear(sev->page_enc_bmap, gfn_start,
+				gfn_end - gfn_start);
+
+unlock:
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -7995,6 +8088,8 @@  static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
 
 	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
+
+	.page_enc_status_hc = svm_page_enc_status_hc,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 079d9fbf278e..f68e76ee7f9c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8001,6 +8001,7 @@  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.nested_get_evmcs_version = NULL,
 	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
 	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+	.page_enc_status_hc = NULL,
 };
 
 static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf95c36cb4f4..68428eef2dde 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7564,6 +7564,12 @@  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		kvm_sched_yield(vcpu->kvm, a0);
 		ret = 0;
 		break;
+	case KVM_HC_PAGE_ENC_STATUS:
+		ret = -KVM_ENOSYS;
+		if (kvm_x86_ops->page_enc_status_hc)
+			ret = kvm_x86_ops->page_enc_status_hc(vcpu->kvm,
+					a0, a1, a2);
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 8b86609849b9..847b83b75dc8 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -29,6 +29,7 @@ 
 #define KVM_HC_CLOCK_PAIRING		9
 #define KVM_HC_SEND_IPI		10
 #define KVM_HC_SCHED_YIELD		11
+#define KVM_HC_PAGE_ENC_STATUS		12
 
 /*
  * hypercalls use architecture specific
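
For completeness, the guest side of this ABI (added by a separate patch in
this series) amounts to a kvm_hypercall3() from the paths that flip the
C-bit. A simplified sketch, not the exact guest patch:

	#include <asm/kvm_para.h>

	/* Tell the host that [paddr, paddr + npages) changed encryption state. */
	static void notify_page_enc_status(unsigned long paddr,
					   unsigned long npages, bool enc)
	{
		kvm_hypercall3(KVM_HC_PAGE_ENC_STATUS, paddr, npages, enc);
	}

In the series this is invoked from the guest's set_memory_encrypted() /
set_memory_decrypted() paths, after the page table attributes have been
updated.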