diff mbox series

[v10,08/16] KVM: X86: Introduce KVM_HC_PAGE_ENC_STATUS hypercall

Message ID 245f84cca80417b490e47da17e711432716e2e06.1612398155.git.ashish.kalra@amd.com (mailing list archive)
State New, archived
Headers show
Series Add AMD SEV guest live migration support | expand

Commit Message

Kalra, Ashish Feb. 4, 2021, 12:38 a.m. UTC
From: Brijesh Singh <brijesh.singh@amd.com>

This hypercall is used by the SEV guest to notify a change in the page
encryption status to the hypervisor. The hypercall should be invoked
only when the encryption attribute is changed from encrypted -> decrypted
and vice versa. By default all guest pages are considered encrypted.

The patch introduces a new shared pages list implemented as a
sorted linked list to track the shared/unencrypted regions marked by the
guest hypercall.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Co-developed-by: Ashish Kalra <ashish.kalra@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
---
 Documentation/virt/kvm/hypercalls.rst |  15 +++
 arch/x86/include/asm/kvm_host.h       |   2 +
 arch/x86/kvm/svm/sev.c                | 150 ++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c                |   2 +
 arch/x86/kvm/svm/svm.h                |   5 +
 arch/x86/kvm/vmx/vmx.c                |   1 +
 arch/x86/kvm/x86.c                    |   6 ++
 include/uapi/linux/kvm_para.h         |   1 +
 8 files changed, 182 insertions(+)

Comments

Tom Lendacky Feb. 4, 2021, 4:03 p.m. UTC | #1
On 2/3/21 6:38 PM, Ashish Kalra wrote:
> From: Brijesh Singh <brijesh.singh@amd.com>
> 
> This hypercall is used by the SEV guest to notify a change in the page
> encryption status to the hypervisor. The hypercall should be invoked
> only when the encryption attribute is changed from encrypted -> decrypted
> and vice versa. By default all guest pages are considered encrypted.
> 
> The patch introduces a new shared pages list implemented as a
> sorted linked list to track the shared/unencrypted regions marked by the
> guest hypercall.
> 

...

> +
> +	if (enc) {
> +		ret = remove_shared_region(gfn_start, gfn_end,
> +					   &sev->shared_pages_list);
> +		if (ret != -ENOMEM)
> +			sev->shared_pages_list_count += ret;
> +	} else {
> +		ret = add_shared_region(gfn_start, gfn_end,
> +					&sev->shared_pages_list);
> +		if (ret > 0)
> +			sev->shared_pages_list_count++;
> +	}

I would move the shared_pages_list_count updates into the add/remove 
functions and then just return 0 or a -EXXXX error code from those 
functions. It seems simpler than "adding" ret or checking for a greater 
than 0 return code.

Thanks,
Tom

> +
> +	mutex_unlock(&kvm->lock);
> +	return ret;
> +}
> +
>   int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>   {
>   	struct kvm_sev_cmd sev_cmd;
> @@ -1693,6 +1842,7 @@ void sev_vm_destroy(struct kvm *kvm)
>   
>   	sev_unbind_asid(kvm, sev->handle);
>   	sev_asid_free(sev->asid);
> +	sev->shared_pages_list_count = 0;
>   }
>   
>   void __init sev_hardware_setup(void)
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index f923e14e87df..bb249ec625fc 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4536,6 +4536,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>   	.complete_emulated_msr = svm_complete_emulated_msr,
>   
>   	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
> +
> +	.page_enc_status_hc = svm_page_enc_status_hc,
>   };
>   
>   static struct kvm_x86_init_ops svm_init_ops __initdata = {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 0fe874ae5498..6437c1fa1f24 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -79,6 +79,9 @@ struct kvm_sev_info {
>   	unsigned long pages_locked; /* Number of pages locked */
>   	struct list_head regions_list;  /* List of registered regions */
>   	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
> +	/* List and count of shared pages */
> +	int shared_pages_list_count;
> +	struct list_head shared_pages_list;
>   };
>   
>   struct kvm_svm {
> @@ -472,6 +475,8 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
>   			       bool has_error_code, u32 error_code);
>   int nested_svm_exit_special(struct vcpu_svm *svm);
>   void sync_nested_vmcb_control(struct vcpu_svm *svm);
> +int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
> +			   unsigned long npages, unsigned long enc);
>   
>   extern struct kvm_x86_nested_ops svm_nested_ops;
>   
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index cc60b1fc3ee7..bcbf53851612 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7705,6 +7705,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
>   	.can_emulate_instruction = vmx_can_emulate_instruction,
>   	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
>   	.migrate_timers = vmx_migrate_timers,
> +	.page_enc_status_hc = NULL,
>   
>   	.msr_filter_changed = vmx_msr_filter_changed,
>   	.complete_emulated_msr = kvm_complete_insn_gp,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 76bce832cade..2f17f0f9ace7 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -8162,6 +8162,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
>   		kvm_sched_yield(vcpu->kvm, a0);
>   		ret = 0;
>   		break;
> +	case KVM_HC_PAGE_ENC_STATUS:
> +		ret = -KVM_ENOSYS;
> +		if (kvm_x86_ops.page_enc_status_hc)
> +			ret = kvm_x86_ops.page_enc_status_hc(vcpu->kvm,
> +					a0, a1, a2);
> +		break;
>   	default:
>   		ret = -KVM_ENOSYS;
>   		break;
> diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
> index 8b86609849b9..847b83b75dc8 100644
> --- a/include/uapi/linux/kvm_para.h
> +++ b/include/uapi/linux/kvm_para.h
> @@ -29,6 +29,7 @@
>   #define KVM_HC_CLOCK_PAIRING		9
>   #define KVM_HC_SEND_IPI		10
>   #define KVM_HC_SCHED_YIELD		11
> +#define KVM_HC_PAGE_ENC_STATUS		12
>   
>   /*
>    * hypercalls use architecture specific
>
Steve Rutherford Feb. 5, 2021, 1:44 a.m. UTC | #2
On Wed, Feb 3, 2021 at 4:38 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
>
> From: Brijesh Singh <brijesh.singh@amd.com>
>
> This hypercall is used by the SEV guest to notify a change in the page
> encryption status to the hypervisor. The hypercall should be invoked
> only when the encryption attribute is changed from encrypted -> decrypted
> and vice versa. By default all guest pages are considered encrypted.
>
> The patch introduces a new shared pages list implemented as a
> sorted linked list to track the shared/unencrypted regions marked by the
> guest hypercall.
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: x86@kernel.org
> Cc: kvm@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> ---
>  Documentation/virt/kvm/hypercalls.rst |  15 +++
>  arch/x86/include/asm/kvm_host.h       |   2 +
>  arch/x86/kvm/svm/sev.c                | 150 ++++++++++++++++++++++++++
>  arch/x86/kvm/svm/svm.c                |   2 +
>  arch/x86/kvm/svm/svm.h                |   5 +
>  arch/x86/kvm/vmx/vmx.c                |   1 +
>  arch/x86/kvm/x86.c                    |   6 ++
>  include/uapi/linux/kvm_para.h         |   1 +
>  8 files changed, 182 insertions(+)
>
> diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> index ed4fddd364ea..7aff0cebab7c 100644
> --- a/Documentation/virt/kvm/hypercalls.rst
> +++ b/Documentation/virt/kvm/hypercalls.rst
> @@ -169,3 +169,18 @@ a0: destination APIC ID
>
>  :Usage example: When sending a call-function IPI-many to vCPUs, yield if
>                 any of the IPI target vCPUs was preempted.
> +
> +
> +8. KVM_HC_PAGE_ENC_STATUS
> +-------------------------
> +:Architecture: x86
> +:Status: active
> +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> +
> +a0: the guest physical address of the start page
> +a1: the number of pages
> +a2: encryption attribute
> +
> +   Where:
> +       * 1: Encryption attribute is set
> +       * 0: Encryption attribute is cleared
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..2da5f5e2a10e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1301,6 +1301,8 @@ struct kvm_x86_ops {
>         int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
>
>         void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
> +       int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> +                                 unsigned long sz, unsigned long mode);
>  };
>
>  struct kvm_x86_nested_ops {
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index 25eaf35ba51d..55c628df5155 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -45,6 +45,11 @@ struct enc_region {
>         unsigned long size;
>  };
>
> +struct shared_region {
> +       struct list_head list;
> +       unsigned long gfn_start, gfn_end;
> +};
> +
>  static int sev_flush_asids(void)
>  {
>         int ret, error = 0;
> @@ -196,6 +201,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>         sev->active = true;
>         sev->asid = asid;
>         INIT_LIST_HEAD(&sev->regions_list);
> +       INIT_LIST_HEAD(&sev->shared_pages_list);
> +       sev->shared_pages_list_count = 0;
>
>         return 0;
>
> @@ -1473,6 +1480,148 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
>         return ret;
>  }
>
> +static int remove_shared_region(unsigned long start, unsigned long end,
> +                               struct list_head *head)
> +{
> +       struct shared_region *pos;
> +
> +       list_for_each_entry(pos, head, list) {
> +               if (pos->gfn_start == start &&
> +                   pos->gfn_end == end) {
> +                       list_del(&pos->list);
> +                       kfree(pos);
> +                       return -1;
> +               } else if (start >= pos->gfn_start && end <= pos->gfn_end) {
> +                       if (start == pos->gfn_start)
> +                               pos->gfn_start = end + 1;
> +                       else if (end == pos->gfn_end)
> +                               pos->gfn_end = start - 1;
> +                       else {
> +                               /* Do a de-merge -- split linked list nodes */
> +                               unsigned long tmp;
> +                               struct shared_region *shrd_region;
> +
> +                               tmp = pos->gfn_end;
> +                               pos->gfn_end = start-1;
> +                               shrd_region = kzalloc(sizeof(*shrd_region), GFP_KERNEL_ACCOUNT);
> +                               if (!shrd_region)
> +                                       return -ENOMEM;
> +                               shrd_region->gfn_start = end + 1;
> +                               shrd_region->gfn_end = tmp;
> +                               list_add(&shrd_region->list, &pos->list);
> +                               return 1;
> +                       }
> +                       return 0;
> +               }
> +       }

This doesn't handle the case where the region being marked as
encrypted is larger than the unencrypted region under
consideration, which (I believe) can happen with the current kexec
handling (since it is oblivious to the prior state).
I would probably break this down into the "five cases": no
intersection (skip), entry is completely contained (drop), entry
completely contains the removed region (split), intersect start
(chop), and intersect end (chop).

>
> +       return 0;
> +}
> +
> +static int add_shared_region(unsigned long start, unsigned long end,
> +                            struct list_head *shared_pages_list)
> +{
> +       struct list_head *head = shared_pages_list;
> +       struct shared_region *shrd_region;
> +       struct shared_region *pos;
> +
> +       if (list_empty(head)) {
> +               shrd_region = kzalloc(sizeof(*shrd_region), GFP_KERNEL_ACCOUNT);
> +               if (!shrd_region)
> +                       return -ENOMEM;
> +               shrd_region->gfn_start = start;
> +               shrd_region->gfn_end = end;
> +               list_add_tail(&shrd_region->list, head);
> +               return 1;
> +       }
> +
> +       /*
> +        * Shared pages list is a sorted list in ascending order of
> +        * guest PA's and also merges consecutive range of guest PA's
> +        */
> +       list_for_each_entry(pos, head, list) {
> +               if (pos->gfn_end < start)
> +                       continue;
> +               /* merge consecutive guest PA(s) */
> +               if (pos->gfn_start <= start && pos->gfn_end >= start) {
> +                       pos->gfn_end = end;

I'm not sure this correctly avoids having duplicate overlapping
elements in the list. It also doesn't merge consecutive contiguous
regions. Current guest implementation should never call the hypercall
with C=0 for the same region twice, without calling with c=1 in
between, but this API should be compatible with that model.

The easiest pattern would probably be to:
1) find (or insert) the node that will contain the added region.
2) remove the contents of the added region from the tail (will
typically do nothing).
3) merge the head of the tail into the current node, if the end of the
current node matches the start of that head.
>
> +                       return 0;
> +               }
> +               break;
> +       }
>
> +       /*
> +        * Add a new node, allocate nodes using GFP_KERNEL_ACCOUNT so that
> +        * kernel memory can be tracked/throttled in case a
> +        * malicious guest makes infinite number of hypercalls to
> +        * exhaust host kernel memory and cause a DOS attack.
> +        */
> +       shrd_region = kzalloc(sizeof(*shrd_region), GFP_KERNEL_ACCOUNT);
> +       if (!shrd_region)
> +               return -ENOMEM;
> +       shrd_region->gfn_start = start;
> +       shrd_region->gfn_end = end;
> +       list_add_tail(&shrd_region->list, &pos->list);
> +       return 1;
>
> +}
> +

Thanks!
Steve
Kalra, Ashish Feb. 5, 2021, 3:32 a.m. UTC | #3
Hello Steve,

On Thu, Feb 04, 2021 at 05:44:27PM -0800, Steve Rutherford wrote:
> On Wed, Feb 3, 2021 at 4:38 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote:
> >
> > From: Brijesh Singh <brijesh.singh@amd.com>
> >
> > This hypercall is used by the SEV guest to notify a change in the page
> > encryption status to the hypervisor. The hypercall should be invoked
> > only when the encryption attribute is changed from encrypted -> decrypted
> > and vice versa. By default all guest pages are considered encrypted.
> >
> > The patch introduces a new shared pages list implemented as a
> > sorted linked list to track the shared/unencrypted regions marked by the
> > guest hypercall.
> >
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> > Cc: Joerg Roedel <joro@8bytes.org>
> > Cc: Borislav Petkov <bp@suse.de>
> > Cc: Tom Lendacky <thomas.lendacky@amd.com>
> > Cc: x86@kernel.org
> > Cc: kvm@vger.kernel.org
> > Cc: linux-kernel@vger.kernel.org
> > Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> > Co-developed-by: Ashish Kalra <ashish.kalra@amd.com>
> > Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> > ---
> >  Documentation/virt/kvm/hypercalls.rst |  15 +++
> >  arch/x86/include/asm/kvm_host.h       |   2 +
> >  arch/x86/kvm/svm/sev.c                | 150 ++++++++++++++++++++++++++
> >  arch/x86/kvm/svm/svm.c                |   2 +
> >  arch/x86/kvm/svm/svm.h                |   5 +
> >  arch/x86/kvm/vmx/vmx.c                |   1 +
> >  arch/x86/kvm/x86.c                    |   6 ++
> >  include/uapi/linux/kvm_para.h         |   1 +
> >  8 files changed, 182 insertions(+)
> >
> > diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
> > index ed4fddd364ea..7aff0cebab7c 100644
> > --- a/Documentation/virt/kvm/hypercalls.rst
> > +++ b/Documentation/virt/kvm/hypercalls.rst
> > @@ -169,3 +169,18 @@ a0: destination APIC ID
> >
> >  :Usage example: When sending a call-function IPI-many to vCPUs, yield if
> >                 any of the IPI target vCPUs was preempted.
> > +
> > +
> > +8. KVM_HC_PAGE_ENC_STATUS
> > +-------------------------
> > +:Architecture: x86
> > +:Status: active
> > +:Purpose: Notify the encryption status changes in guest page table (SEV guest)
> > +
> > +a0: the guest physical address of the start page
> > +a1: the number of pages
> > +a2: encryption attribute
> > +
> > +   Where:
> > +       * 1: Encryption attribute is set
> > +       * 0: Encryption attribute is cleared
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 3d6616f6f6ef..2da5f5e2a10e 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1301,6 +1301,8 @@ struct kvm_x86_ops {
> >         int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
> >
> >         void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
> > +       int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
> > +                                 unsigned long sz, unsigned long mode);
> >  };
> >
> >  struct kvm_x86_nested_ops {
> > diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> > index 25eaf35ba51d..55c628df5155 100644
> > --- a/arch/x86/kvm/svm/sev.c
> > +++ b/arch/x86/kvm/svm/sev.c
> > @@ -45,6 +45,11 @@ struct enc_region {
> >         unsigned long size;
> >  };
> >
> > +struct shared_region {
> > +       struct list_head list;
> > +       unsigned long gfn_start, gfn_end;
> > +};
> > +
> >  static int sev_flush_asids(void)
> >  {
> >         int ret, error = 0;
> > @@ -196,6 +201,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >         sev->active = true;
> >         sev->asid = asid;
> >         INIT_LIST_HEAD(&sev->regions_list);
> > +       INIT_LIST_HEAD(&sev->shared_pages_list);
> > +       sev->shared_pages_list_count = 0;
> >
> >         return 0;
> >
> > @@ -1473,6 +1480,148 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >         return ret;
> >  }
> >
> > +static int remove_shared_region(unsigned long start, unsigned long end,
> > +                               struct list_head *head)
> > +{
> > +       struct shared_region *pos;
> > +
> > +       list_for_each_entry(pos, head, list) {
> > +               if (pos->gfn_start == start &&
> > +                   pos->gfn_end == end) {
> > +                       list_del(&pos->list);
> > +                       kfree(pos);
> > +                       return -1;
> > +               } else if (start >= pos->gfn_start && end <= pos->gfn_end) {
> > +                       if (start == pos->gfn_start)
> > +                               pos->gfn_start = end + 1;
> > +                       else if (end == pos->gfn_end)
> > +                               pos->gfn_end = start - 1;
> > +                       else {
> > +                               /* Do a de-merge -- split linked list nodes */
> > +                               unsigned long tmp;
> > +                               struct shared_region *shrd_region;
> > +
> > +                               tmp = pos->gfn_end;
> > +                               pos->gfn_end = start-1;
> > +                               shrd_region = kzalloc(sizeof(*shrd_region), GFP_KERNEL_ACCOUNT);
> > +                               if (!shrd_region)
> > +                                       return -ENOMEM;
> > +                               shrd_region->gfn_start = end + 1;
> > +                               shrd_region->gfn_end = tmp;
> > +                               list_add(&shrd_region->list, &pos->list);
> > +                               return 1;
> > +                       }
> > +                       return 0;
> > +               }
> > +       }
> 
> This doesn't handle the case where the region being marked as
> encrypted is larger than the unencrypted region under
> consideration, which (I believe) can happen with the current kexec
> handling (since it is oblivious to the prior state).
> I would probably break this down into the "five cases": no
> intersection (skip), entry is completely contained (drop), entry
> completely contains the removed region (split), intersect start
> (chop), and intersect end (chop).
> 

I believe that the above is already handling these cases :

1). no intersection (skip) : handled
2). entry is completely contained (drop) : handled
3). entry completely contains the removed region (split) : handled
4). intersect start in case of #3 (chop) : handled
5). intersect end in case of #3 (chop) : handled.

The one case it does not handle currently is where the region being marked 
as encrypted is larger than the unencrypted region under consideration.

> >
> > +       return 0;
> > +}
> > +
> > +static int add_shared_region(unsigned long start, unsigned long end,
> > +                            struct list_head *shared_pages_list)
> > +{
> > +       struct list_head *head = shared_pages_list;
> > +       struct shared_region *shrd_region;
> > +       struct shared_region *pos;
> > +
> > +       if (list_empty(head)) {
> > +               shrd_region = kzalloc(sizeof(*shrd_region), GFP_KERNEL_ACCOUNT);
> > +               if (!shrd_region)
> > +                       return -ENOMEM;
> > +               shrd_region->gfn_start = start;
> > +               shrd_region->gfn_end = end;
> > +               list_add_tail(&shrd_region->list, head);
> > +               return 1;
> > +       }
> > +
> > +       /*
> > +        * Shared pages list is a sorted list in ascending order of
> > +        * guest PA's and also merges consecutive range of guest PA's
> > +        */
> > +       list_for_each_entry(pos, head, list) {
> > +               if (pos->gfn_end < start)
> > +                       continue;
> > +               /* merge consecutive guest PA(s) */
> > +               if (pos->gfn_start <= start && pos->gfn_end >= start) {
> > +                       pos->gfn_end = end;
> 
> I'm not sure this correctly avoids having duplicate overlapping
> elements in the list. It also doesn't merge consecutive contiguous
> regions. Current guest implementation should never call the hypercall
> with C=0 for the same region twice, without calling with c=1 in
> between, but this API should be compatible with that model.

Yes it does not handle duplicate overlapping elements being added in
the list, I will fix that. 

And merging consecutive contiguous regions is supported, 
the code is merging consecutive GPAs as the example below shows :

...
[   50.894254] marking as unencrypted, gfn_start = c0000, gfn_end = fc000
[   50.894255] inserting new node @guest PA = c0000
[   50.894259] marking as unencrypted, gfn_start = fc000, gfn_end = fec00
[   50.894260] merging consecutive GPA start = c0000, end = fc000
[   50.894267] marking as unencrypted, gfn_start = fec00, gfn_end = fec01
[   50.894267] merging consecutive GPA start = c0000, end = fec00
[   50.894274] marking as unencrypted, gfn_start = fec01, gfn_end = fed00
[   50.894274] merging consecutive GPA start = c0000, end = fec01
[   50.894278] marking as unencrypted, gfn_start = fed00, gfn_end = fed01
[   50.894279] merging consecutive GPA start = c0000, end = fed00
[   50.894283] marking as unencrypted, gfn_start = fed00, gfn_end = fed1c
[   50.894283] merging consecutive GPA start = c0000, end = fed01
[   50.894287] marking as unencrypted, gfn_start = fed1c, gfn_end = fed20
[   50.894288] merging consecutive GPA start = c0000, end = fed1c
[   50.894294] marking as unencrypted, gfn_start = fed20, gfn_end = fee00
[   50.894294] merging consecutive GPA start = c0000, end = fed20
[   50.894303] marking as unencrypted, gfn_start = fee00, gfn_end = fef00
[   50.894304] merging consecutive GPA start = c0000, end = fee00
[   50.894669] marking as unencrypted, gfn_start = fef00, gfn_end = 1000000
...

The above is merged into a single entry, as shown below:

qemu-system-x86_64: gfn start = c0000, gfn_end = 1000000

> 
> The easiest pattern would probably be to:
> 1) find (or insert) the node that will contain the added region.

Except for the find part, the above is handled.

> 2) remove the contents of the added region from the tail (will
> typically do nothing).
> 3) merge the head of the tail into the current node, if the end of the
> current node matches the start of that head.

This is also being handled.

Thanks,
Ashish

> >
> > +                       return 0;
> > +               }
> > +               break;
> > +       }
> >
> > +       /*
> > +        * Add a new node, allocate nodes using GFP_KERNEL_ACCOUNT so that
> > +        * kernel memory can be tracked/throttled in case a
> > +        * malicious guest makes infinite number of hypercalls to
> > +        * exhaust host kernel memory and cause a DOS attack.
> > +        */
> > +       shrd_region = kzalloc(sizeof(*shrd_region), GFP_KERNEL_ACCOUNT);
> > +       if (!shrd_region)
> > +               return -ENOMEM;
> > +       shrd_region->gfn_start = start;
> > +       shrd_region->gfn_end = end;
> > +       list_add_tail(&shrd_region->list, &pos->list);
> > +       return 1;
> >
> > +}
> > +
> 
> Thanks!
> Steve
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst
index ed4fddd364ea..7aff0cebab7c 100644
--- a/Documentation/virt/kvm/hypercalls.rst
+++ b/Documentation/virt/kvm/hypercalls.rst
@@ -169,3 +169,18 @@  a0: destination APIC ID
 
 :Usage example: When sending a call-function IPI-many to vCPUs, yield if
 	        any of the IPI target vCPUs was preempted.
+
+
+8. KVM_HC_PAGE_ENC_STATUS
+-------------------------
+:Architecture: x86
+:Status: active
+:Purpose: Notify the encryption status changes in guest page table (SEV guest)
+
+a0: the guest physical address of the start page
+a1: the number of pages
+a2: encryption attribute
+
+   Where:
+	* 1: Encryption attribute is set
+	* 0: Encryption attribute is cleared
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3d6616f6f6ef..2da5f5e2a10e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1301,6 +1301,8 @@  struct kvm_x86_ops {
 	int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
 
 	void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+	int (*page_enc_status_hc)(struct kvm *kvm, unsigned long gpa,
+				  unsigned long sz, unsigned long mode);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 25eaf35ba51d..55c628df5155 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -45,6 +45,11 @@  struct enc_region {
 	unsigned long size;
 };
 
+struct shared_region {
+	struct list_head list;
+	unsigned long gfn_start, gfn_end;
+};
+
 static int sev_flush_asids(void)
 {
 	int ret, error = 0;
@@ -196,6 +201,8 @@  static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	sev->active = true;
 	sev->asid = asid;
 	INIT_LIST_HEAD(&sev->regions_list);
+	INIT_LIST_HEAD(&sev->shared_pages_list);
+	sev->shared_pages_list_count = 0;
 
 	return 0;
 
@@ -1473,6 +1480,148 @@  static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int remove_shared_region(unsigned long start, unsigned long end,
+				struct list_head *head)
+{
+	struct shared_region *pos;
+
+	list_for_each_entry(pos, head, list) {
+		if (pos->gfn_start == start &&
+		    pos->gfn_end == end) {
+			list_del(&pos->list);
+			kfree(pos);
+			return -1;
+		} else if (start >= pos->gfn_start && end <= pos->gfn_end) {
+			if (start == pos->gfn_start)
+				pos->gfn_start = end + 1;
+			else if (end == pos->gfn_end)
+				pos->gfn_end = start - 1;
+			else {
+				/* Do a de-merge -- split linked list nodes */
+				unsigned long tmp;
+				struct shared_region *shrd_region;
+
+				tmp = pos->gfn_end;
+				pos->gfn_end = start-1;
+				shrd_region = kzalloc(sizeof(*shrd_region), GFP_KERNEL_ACCOUNT);
+				if (!shrd_region)
+					return -ENOMEM;
+				shrd_region->gfn_start = end + 1;
+				shrd_region->gfn_end = tmp;
+				list_add(&shrd_region->list, &pos->list);
+				return 1;
+			}
+			return 0;
+		}
+	}
+	return 0;
+}
+
+static int add_shared_region(unsigned long start, unsigned long end,
+			     struct list_head *shared_pages_list)
+{
+	struct list_head *head = shared_pages_list;
+	struct shared_region *shrd_region;
+	struct shared_region *pos;
+
+	if (list_empty(head)) {
+		shrd_region = kzalloc(sizeof(*shrd_region), GFP_KERNEL_ACCOUNT);
+		if (!shrd_region)
+			return -ENOMEM;
+		shrd_region->gfn_start = start;
+		shrd_region->gfn_end = end;
+		list_add_tail(&shrd_region->list, head);
+		return 1;
+	}
+
+	/*
+	 * Shared pages list is a sorted list in ascending order of
+	 * guest PA's and also merges consecutive range of guest PA's
+	 */
+	list_for_each_entry(pos, head, list) {
+		if (pos->gfn_end < start)
+			continue;
+		/* merge consecutive guest PA(s) */
+		if (pos->gfn_start <= start && pos->gfn_end >= start) {
+			pos->gfn_end = end;
+			return 0;
+		}
+		break;
+	}
+	/*
+	 * Add a new node, allocate nodes using GFP_KERNEL_ACCOUNT so that
+	 * kernel memory can be tracked/throttled in case a
+	 * malicious guest makes infinite number of hypercalls to
+	 * exhaust host kernel memory and cause a DOS attack.
+	 */
+	shrd_region = kzalloc(sizeof(*shrd_region), GFP_KERNEL_ACCOUNT);
+	if (!shrd_region)
+		return -ENOMEM;
+	shrd_region->gfn_start = start;
+	shrd_region->gfn_end = end;
+	list_add_tail(&shrd_region->list, &pos->list);
+	return 1;
+}
+
+int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
+			   unsigned long npages, unsigned long enc)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	kvm_pfn_t pfn_start, pfn_end;
+	gfn_t gfn_start, gfn_end;
+	int ret = 0;
+
+	if (!sev_guest(kvm))
+		return -EINVAL;
+
+	if (!npages)
+		return 0;
+
+	gfn_start = gpa_to_gfn(gpa);
+	gfn_end = gfn_start + npages;
+
+	/* out of bound access error check */
+	if (gfn_end <= gfn_start)
+		return -EINVAL;
+
+	/* lets make sure that gpa exist in our memslot */
+	pfn_start = gfn_to_pfn(kvm, gfn_start);
+	pfn_end = gfn_to_pfn(kvm, gfn_end);
+
+	if (is_error_noslot_pfn(pfn_start) && !is_noslot_pfn(pfn_start)) {
+		/*
+		 * Allow guest MMIO range(s) to be added
+		 * to the shared pages list.
+		 */
+		return -EINVAL;
+	}
+
+	if (is_error_noslot_pfn(pfn_end) && !is_noslot_pfn(pfn_end)) {
+		/*
+		 * Allow guest MMIO range(s) to be added
+		 * to the shared pages list.
+		 */
+		return -EINVAL;
+	}
+
+	mutex_lock(&kvm->lock);
+
+	if (enc) {
+		ret = remove_shared_region(gfn_start, gfn_end,
+					   &sev->shared_pages_list);
+		if (ret != -ENOMEM)
+			sev->shared_pages_list_count += ret;
+	} else {
+		ret = add_shared_region(gfn_start, gfn_end,
+					&sev->shared_pages_list);
+		if (ret > 0)
+			sev->shared_pages_list_count++;
+	}
+
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1693,6 +1842,7 @@  void sev_vm_destroy(struct kvm *kvm)
 
 	sev_unbind_asid(kvm, sev->handle);
 	sev_asid_free(sev->asid);
+	sev->shared_pages_list_count = 0;
 }
 
 void __init sev_hardware_setup(void)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f923e14e87df..bb249ec625fc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4536,6 +4536,8 @@  static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.complete_emulated_msr = svm_complete_emulated_msr,
 
 	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+
+	.page_enc_status_hc = svm_page_enc_status_hc,
 };
 
 static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 0fe874ae5498..6437c1fa1f24 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,6 +79,9 @@  struct kvm_sev_info {
 	unsigned long pages_locked; /* Number of pages locked */
 	struct list_head regions_list;  /* List of registered regions */
 	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
+	/* List and count of shared pages */
+	int shared_pages_list_count;
+	struct list_head shared_pages_list;
 };
 
 struct kvm_svm {
@@ -472,6 +475,8 @@  int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 			       bool has_error_code, u32 error_code);
 int nested_svm_exit_special(struct vcpu_svm *svm);
 void sync_nested_vmcb_control(struct vcpu_svm *svm);
+int svm_page_enc_status_hc(struct kvm *kvm, unsigned long gpa,
+			   unsigned long npages, unsigned long enc);
 
 extern struct kvm_x86_nested_ops svm_nested_ops;
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cc60b1fc3ee7..bcbf53851612 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7705,6 +7705,7 @@  static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.can_emulate_instruction = vmx_can_emulate_instruction,
 	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
 	.migrate_timers = vmx_migrate_timers,
+	.page_enc_status_hc = NULL,
 
 	.msr_filter_changed = vmx_msr_filter_changed,
 	.complete_emulated_msr = kvm_complete_insn_gp,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 76bce832cade..2f17f0f9ace7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8162,6 +8162,12 @@  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		kvm_sched_yield(vcpu->kvm, a0);
 		ret = 0;
 		break;
+	case KVM_HC_PAGE_ENC_STATUS:
+		ret = -KVM_ENOSYS;
+		if (kvm_x86_ops.page_enc_status_hc)
+			ret = kvm_x86_ops.page_enc_status_hc(vcpu->kvm,
+					a0, a1, a2);
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 8b86609849b9..847b83b75dc8 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -29,6 +29,7 @@ 
 #define KVM_HC_CLOCK_PAIRING		9
 #define KVM_HC_SEND_IPI		10
 #define KVM_HC_SCHED_YIELD		11
+#define KVM_HC_PAGE_ENC_STATUS		12
 
 /*
  * hypercalls use architecture specific