
[RFC,3/6] KVM: SVM: Implement demand page pinning

Message ID 20220118110621.62462-4-nikunj@amd.com
State New, archived
Series KVM: SVM: Defer page pinning for SEV guests

Commit Message

Nikunj A. Dadhania Jan. 18, 2022, 11:06 a.m. UTC
Use the memslot metadata to store the pinned data along with the pfns.
This improves the SEV guest startup time from O(n) to a constant by
deferring guest page pinning until the pages are used to satisfy nested
page faults. The page reference will be dropped in the memslot free
path.

Remove the enc_region structure definition and the code which did
upfront pinning, as they are no longer needed in view of the demand
pinning support.

Leave svm_register_enc_region() and svm_unregister_enc_region() as stubs
since qemu is dependent on this API.

Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
---
 arch/x86/kvm/svm/sev.c | 208 ++++++++++++++++-------------------------
 arch/x86/kvm/svm/svm.c |   1 +
 arch/x86/kvm/svm/svm.h |   3 +-
 3 files changed, 81 insertions(+), 131 deletions(-)

Comments

Peter Gonda Jan. 25, 2022, 4:47 p.m. UTC | #1
On Tue, Jan 18, 2022 at 4:07 AM Nikunj A Dadhania <nikunj@amd.com> wrote:
>
> Use the memslot metadata to store the pinned data along with the pfns.
> This improves the SEV guest startup time from O(n) to a constant by
> deferring guest page pinning until the pages are used to satisfy nested
> page faults. The page reference will be dropped in the memslot free
> path.
>
> Remove the enc_region structure definition and the code which did
> upfront pinning, as they are no longer needed in view of the demand
> pinning support.
>
> Leave svm_register_enc_region() and svm_unregister_enc_region() as stubs
> since qemu is dependent on this API.
>
> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
> ---
>  arch/x86/kvm/svm/sev.c | 208 ++++++++++++++++-------------------------
>  arch/x86/kvm/svm/svm.c |   1 +
>  arch/x86/kvm/svm/svm.h |   3 +-
>  3 files changed, 81 insertions(+), 131 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index d972ab4956d4..a962bed97a0b 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -66,14 +66,6 @@ static unsigned int nr_asids;
>  static unsigned long *sev_asid_bitmap;
>  static unsigned long *sev_reclaim_asid_bitmap;
>
> -struct enc_region {
> -       struct list_head list;
> -       unsigned long npages;
> -       struct page **pages;
> -       unsigned long uaddr;
> -       unsigned long size;
> -};
> -
>  /* Called with the sev_bitmap_lock held, or on shutdown  */
>  static int sev_flush_asids(int min_asid, int max_asid)
>  {
> @@ -257,8 +249,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>         if (ret)
>                 goto e_free;
>
> -       INIT_LIST_HEAD(&sev->regions_list);
> -
>         return 0;
>
>  e_free:
> @@ -1637,8 +1627,6 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>         src->handle = 0;
>         src->pages_locked = 0;
>         src->enc_context_owner = NULL;
> -
> -       list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);

I think we need to move the pinned SPTE entries into the target, and
repin the pages in the target here. Otherwise the pages will be
unpinned when the source is cleaned up. Have you thought about how
this could be done?

>  }
>
>  static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
> @@ -1861,115 +1849,13 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>  int svm_register_enc_region(struct kvm *kvm,
>                             struct kvm_enc_region *range)
>  {
> -       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> -       struct enc_region *region;
> -       int ret = 0;
> -
> -       if (!sev_guest(kvm))
> -               return -ENOTTY;
> -
> -       /* If kvm is mirroring encryption context it isn't responsible for it */
> -       if (is_mirroring_enc_context(kvm))
> -               return -EINVAL;
> -
> -       if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
> -               return -EINVAL;
> -
> -       region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
> -       if (!region)
> -               return -ENOMEM;
> -
> -       mutex_lock(&kvm->lock);
> -       region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
> -       if (IS_ERR(region->pages)) {
> -               ret = PTR_ERR(region->pages);
> -               mutex_unlock(&kvm->lock);
> -               goto e_free;
> -       }
> -
> -       region->uaddr = range->addr;
> -       region->size = range->size;
> -
> -       list_add_tail(&region->list, &sev->regions_list);
> -       mutex_unlock(&kvm->lock);
> -
> -       /*
> -        * The guest may change the memory encryption attribute from C=0 -> C=1
> -        * or vice versa for this memory range. Lets make sure caches are
> -        * flushed to ensure that guest data gets written into memory with
> -        * correct C-bit.
> -        */
> -       sev_clflush_pages(region->pages, region->npages);
> -
> -       return ret;
> -
> -e_free:
> -       kfree(region);
> -       return ret;
> -}
> -
> -static struct enc_region *
> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
> -{
> -       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> -       struct list_head *head = &sev->regions_list;
> -       struct enc_region *i;
> -
> -       list_for_each_entry(i, head, list) {
> -               if (i->uaddr == range->addr &&
> -                   i->size == range->size)
> -                       return i;
> -       }
> -
> -       return NULL;
> -}
> -
> -static void __unregister_enc_region_locked(struct kvm *kvm,
> -                                          struct enc_region *region)
> -{
> -       sev_unpin_memory(kvm, region->pages, region->npages);
> -       list_del(&region->list);
> -       kfree(region);
> +       return 0;
>  }
>
>  int svm_unregister_enc_region(struct kvm *kvm,
>                               struct kvm_enc_region *range)
>  {
> -       struct enc_region *region;
> -       int ret;
> -
> -       /* If kvm is mirroring encryption context it isn't responsible for it */
> -       if (is_mirroring_enc_context(kvm))
> -               return -EINVAL;
> -
> -       mutex_lock(&kvm->lock);
> -
> -       if (!sev_guest(kvm)) {
> -               ret = -ENOTTY;
> -               goto failed;
> -       }
> -
> -       region = find_enc_region(kvm, range);
> -       if (!region) {
> -               ret = -EINVAL;
> -               goto failed;
> -       }
> -
> -       /*
> -        * Ensure that all guest tagged cache entries are flushed before
> -        * releasing the pages back to the system for use. CLFLUSH will
> -        * not do this, so issue a WBINVD.
> -        */
> -       wbinvd_on_all_cpus();
> -
> -       __unregister_enc_region_locked(kvm, region);
> -
> -       mutex_unlock(&kvm->lock);
>         return 0;
> -
> -failed:
> -       mutex_unlock(&kvm->lock);
> -       return ret;
>  }
>
>  int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> @@ -2018,7 +1904,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>         mirror_sev->fd = source_sev->fd;
>         mirror_sev->es_active = source_sev->es_active;
>         mirror_sev->handle = source_sev->handle;
> -       INIT_LIST_HEAD(&mirror_sev->regions_list);
>         ret = 0;
>
>         /*
> @@ -2038,8 +1923,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>  void sev_vm_destroy(struct kvm *kvm)
>  {
>         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> -       struct list_head *head = &sev->regions_list;
> -       struct list_head *pos, *q;
>
>         WARN_ON(sev->num_mirrored_vms);
>
> @@ -2066,18 +1949,6 @@ void sev_vm_destroy(struct kvm *kvm)
>          */
>         wbinvd_on_all_cpus();
>
> -       /*
> -        * if userspace was terminated before unregistering the memory regions
> -        * then lets unpin all the registered memory.
> -        */
> -       if (!list_empty(head)) {
> -               list_for_each_safe(pos, q, head) {
> -                       __unregister_enc_region_locked(kvm,
> -                               list_entry(pos, struct enc_region, list));
> -                       cond_resched();
> -               }
> -       }
> -
>         sev_unbind_asid(kvm, sev->handle);
>         sev_asid_free(sev);
>  }
> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>         ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>  }
>
> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +                 kvm_pfn_t pfn)
> +{
> +       struct kvm_arch_memory_slot *aslot;
> +       struct kvm_memory_slot *slot;
> +       gfn_t rel_gfn, pin_pfn;
> +       unsigned long npages;
> +       kvm_pfn_t old_pfn;
> +       int i;
> +
> +       if (!sev_guest(kvm))
> +               return;
> +
> +       if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
> +               return;
> +
> +       /* Tested till 1GB pages */
> +       if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
> +               return;
> +
> +       slot = gfn_to_memslot(kvm, gfn);
> +       if (!slot || !slot->arch.pfns)
> +               return;
> +
> +       /*
> +        * Use relative gfn index within the memslot for the bitmap as well as
> +        * the pfns array
> +        */
> +       rel_gfn = gfn - slot->base_gfn;
> +       aslot = &slot->arch;
> +       pin_pfn = pfn;
> +       npages = KVM_PAGES_PER_HPAGE(level);
> +
> +       /* Pin the page, KVM doesn't yet support page migration. */
> +       for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
> +               if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
> +                       old_pfn = aslot->pfns[rel_gfn];
> +                       if (old_pfn == pin_pfn)
> +                               continue;
> +
> +                       put_page(pfn_to_page(old_pfn));
> +               }
> +
> +               set_bit(rel_gfn, aslot->pinned_bitmap);
> +               aslot->pfns[rel_gfn] = pin_pfn;
> +               get_page(pfn_to_page(pin_pfn));
> +       }
> +
> +       /*
> +        * Flush any cached lines of the page being added since "ownership" of
> +        * it will be transferred from the host to an encrypted guest.
> +        */
> +       clflush_cache_range(__va(pfn << PAGE_SHIFT), page_level_size(level));
> +}
> +
>  void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
>  {
>         struct kvm_arch_memory_slot *aslot = &slot->arch;
> +       kvm_pfn_t *pfns;
> +       gfn_t gfn;
> +       int i;
>
>         if (!sev_guest(kvm))
>                 return;
>
> +       if (!aslot->pinned_bitmap || !slot->arch.pfns)
> +               goto out;
> +
> +       pfns = aslot->pfns;
> +
> +       /*
> +        * Iterate the memslot to find the pinned pfn using the bitmap and drop
> +        * the pfn stored.
> +        */
> +       for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) {
> +               if (test_and_clear_bit(i, aslot->pinned_bitmap)) {
> +                       if (WARN_ON(!pfns[i]))
> +                               continue;
> +
> +                       put_page(pfn_to_page(pfns[i]));
> +               }
> +       }
> +
> +out:
>         if (aslot->pinned_bitmap) {
>                 kvfree(aslot->pinned_bitmap);
>                 aslot->pinned_bitmap = NULL;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 3fb19974f719..22535c680b3f 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4743,6 +4743,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>
>         .alloc_memslot_metadata = sev_alloc_memslot_metadata,
>         .free_memslot = sev_free_memslot,
> +       .pin_spte = sev_pin_spte,
>  };
>
>  static struct kvm_x86_init_ops svm_init_ops __initdata = {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index b2f8b3b52680..c731bc91ea8f 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -77,7 +77,6 @@ struct kvm_sev_info {
>         unsigned int handle;    /* SEV firmware handle */
>         int fd;                 /* SEV device fd */
>         unsigned long pages_locked; /* Number of pages locked */
> -       struct list_head regions_list;  /* List of registered regions */
>         u64 ap_jump_table;      /* SEV-ES AP Jump Table address */
>         struct kvm *enc_context_owner; /* Owner of copied encryption context */
>         unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
> @@ -648,5 +647,7 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
>                                struct kvm_memory_slot *new);
>  void sev_free_memslot(struct kvm *kvm,
>                       struct kvm_memory_slot *slot);
> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +                 kvm_pfn_t pfn);
>
>  #endif
> --
> 2.32.0
>
Nikunj A. Dadhania Jan. 25, 2022, 5:49 p.m. UTC | #2
Hi Peter

On 1/25/2022 10:17 PM, Peter Gonda wrote:
>> @@ -1637,8 +1627,6 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>>         src->handle = 0;
>>         src->pages_locked = 0;
>>         src->enc_context_owner = NULL;
>> -
>> -       list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
> I think we need to move the pinned SPTE entries into the target, and
> repin the pages in the target here. Otherwise the pages will be
> unpinned when the source is cleaned up. Have you thought about how
> this could be done?
> 
I am testing migration with the pinned_list, and I see that all the guest pages are
transferred/pinned on the other side during migration. I think there is an
assumption that all private pages need to be moved.

QEMU: target/i386/sev.c:bool sev_is_gfn_in_unshared_region(unsigned long gfn)

Will dig more on this.

Regards
Nikunj
Peter Gonda Jan. 25, 2022, 5:59 p.m. UTC | #3
On Tue, Jan 25, 2022 at 10:49 AM Nikunj A. Dadhania <nikunj@amd.com> wrote:
>
> Hi Peter
>
> On 1/25/2022 10:17 PM, Peter Gonda wrote:
> >> @@ -1637,8 +1627,6 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
> >>         src->handle = 0;
> >>         src->pages_locked = 0;
> >>         src->enc_context_owner = NULL;
> >> -
> >> -       list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
> > I think we need to move the pinned SPTE entries into the target, and
> > repin the pages in the target here. Otherwise the pages will be
> > unpinned when the source is cleaned up. Have you thought about how
> > this could be done?
> >
> I am testing migration with the pinned_list, and I see that all the guest pages are
> transferred/pinned on the other side during migration. I think there is an
> assumption that all private pages need to be moved.
>
> QEMU: target/i386/sev.c:bool sev_is_gfn_in_unshared_region(unsigned long gfn)
>
> Will dig more on this.

The code you linked appears to be for a remote migration. This
function is for an "intra-host" migration, meaning we are just moving
the VM's memory and state to a new userspace VMM on the same host, not
to an entirely new host.

>
> Regards
> Nikunj
David Hildenbrand Jan. 26, 2022, 10:46 a.m. UTC | #4
On 18.01.22 12:06, Nikunj A Dadhania wrote:
> Use the memslot metadata to store the pinned data along with the pfns.
> This improves the SEV guest startup time from O(n) to a constant by
> deferring guest page pinning until the pages are used to satisfy nested
> page faults. The page reference will be dropped in the memslot free
> path.
> 
> Remove the enc_region structure definition and the code which did
> upfront pinning, as they are no longer needed in view of the demand
> pinning support.
> 
> Leave svm_register_enc_region() and svm_unregister_enc_region() as stubs
> since qemu is dependent on this API.
> 
> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
> ---
>  arch/x86/kvm/svm/sev.c | 208 ++++++++++++++++-------------------------
>  arch/x86/kvm/svm/svm.c |   1 +
>  arch/x86/kvm/svm/svm.h |   3 +-
>  3 files changed, 81 insertions(+), 131 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index d972ab4956d4..a962bed97a0b 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -66,14 +66,6 @@ static unsigned int nr_asids;
>  static unsigned long *sev_asid_bitmap;
>  static unsigned long *sev_reclaim_asid_bitmap;
>  
> -struct enc_region {
> -	struct list_head list;
> -	unsigned long npages;
> -	struct page **pages;
> -	unsigned long uaddr;
> -	unsigned long size;
> -};
> -
>  /* Called with the sev_bitmap_lock held, or on shutdown  */
>  static int sev_flush_asids(int min_asid, int max_asid)
>  {
> @@ -257,8 +249,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>  	if (ret)
>  		goto e_free;
>  
> -	INIT_LIST_HEAD(&sev->regions_list);
> -
>  	return 0;
>  
>  e_free:
> @@ -1637,8 +1627,6 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>  	src->handle = 0;
>  	src->pages_locked = 0;
>  	src->enc_context_owner = NULL;
> -
> -	list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
>  }
>  
>  static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
> @@ -1861,115 +1849,13 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>  int svm_register_enc_region(struct kvm *kvm,
>  			    struct kvm_enc_region *range)
>  {
> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> -	struct enc_region *region;
> -	int ret = 0;
> -
> -	if (!sev_guest(kvm))
> -		return -ENOTTY;
> -
> -	/* If kvm is mirroring encryption context it isn't responsible for it */
> -	if (is_mirroring_enc_context(kvm))
> -		return -EINVAL;
> -
> -	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
> -		return -EINVAL;
> -
> -	region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
> -	if (!region)
> -		return -ENOMEM;
> -
> -	mutex_lock(&kvm->lock);
> -	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
> -	if (IS_ERR(region->pages)) {
> -		ret = PTR_ERR(region->pages);
> -		mutex_unlock(&kvm->lock);
> -		goto e_free;
> -	}
> -
> -	region->uaddr = range->addr;
> -	region->size = range->size;
> -
> -	list_add_tail(&region->list, &sev->regions_list);
> -	mutex_unlock(&kvm->lock);
> -
> -	/*
> -	 * The guest may change the memory encryption attribute from C=0 -> C=1
> -	 * or vice versa for this memory range. Lets make sure caches are
> -	 * flushed to ensure that guest data gets written into memory with
> -	 * correct C-bit.
> -	 */
> -	sev_clflush_pages(region->pages, region->npages);
> -
> -	return ret;
> -
> -e_free:
> -	kfree(region);
> -	return ret;
> -}
> -
> -static struct enc_region *
> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
> -{
> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> -	struct list_head *head = &sev->regions_list;
> -	struct enc_region *i;
> -
> -	list_for_each_entry(i, head, list) {
> -		if (i->uaddr == range->addr &&
> -		    i->size == range->size)
> -			return i;
> -	}
> -
> -	return NULL;
> -}
> -
> -static void __unregister_enc_region_locked(struct kvm *kvm,
> -					   struct enc_region *region)
> -{
> -	sev_unpin_memory(kvm, region->pages, region->npages);
> -	list_del(&region->list);
> -	kfree(region);
> +	return 0;
>  }
>  
>  int svm_unregister_enc_region(struct kvm *kvm,
>  			      struct kvm_enc_region *range)
>  {
> -	struct enc_region *region;
> -	int ret;
> -
> -	/* If kvm is mirroring encryption context it isn't responsible for it */
> -	if (is_mirroring_enc_context(kvm))
> -		return -EINVAL;
> -
> -	mutex_lock(&kvm->lock);
> -
> -	if (!sev_guest(kvm)) {
> -		ret = -ENOTTY;
> -		goto failed;
> -	}
> -
> -	region = find_enc_region(kvm, range);
> -	if (!region) {
> -		ret = -EINVAL;
> -		goto failed;
> -	}
> -
> -	/*
> -	 * Ensure that all guest tagged cache entries are flushed before
> -	 * releasing the pages back to the system for use. CLFLUSH will
> -	 * not do this, so issue a WBINVD.
> -	 */
> -	wbinvd_on_all_cpus();
> -
> -	__unregister_enc_region_locked(kvm, region);
> -
> -	mutex_unlock(&kvm->lock);
>  	return 0;
> -
> -failed:
> -	mutex_unlock(&kvm->lock);
> -	return ret;
>  }
>  
>  int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> @@ -2018,7 +1904,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>  	mirror_sev->fd = source_sev->fd;
>  	mirror_sev->es_active = source_sev->es_active;
>  	mirror_sev->handle = source_sev->handle;
> -	INIT_LIST_HEAD(&mirror_sev->regions_list);
>  	ret = 0;
>  
>  	/*
> @@ -2038,8 +1923,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>  void sev_vm_destroy(struct kvm *kvm)
>  {
>  	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> -	struct list_head *head = &sev->regions_list;
> -	struct list_head *pos, *q;
>  
>  	WARN_ON(sev->num_mirrored_vms);
>  
> @@ -2066,18 +1949,6 @@ void sev_vm_destroy(struct kvm *kvm)
>  	 */
>  	wbinvd_on_all_cpus();
>  
> -	/*
> -	 * if userspace was terminated before unregistering the memory regions
> -	 * then lets unpin all the registered memory.
> -	 */
> -	if (!list_empty(head)) {
> -		list_for_each_safe(pos, q, head) {
> -			__unregister_enc_region_locked(kvm,
> -				list_entry(pos, struct enc_region, list));
> -			cond_resched();
> -		}
> -	}
> -
>  	sev_unbind_asid(kvm, sev->handle);
>  	sev_asid_free(sev);
>  }
> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>  	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>  }
>  
> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +		  kvm_pfn_t pfn)
> +{
> +	struct kvm_arch_memory_slot *aslot;
> +	struct kvm_memory_slot *slot;
> +	gfn_t rel_gfn, pin_pfn;
> +	unsigned long npages;
> +	kvm_pfn_t old_pfn;
> +	int i;
> +
> +	if (!sev_guest(kvm))
> +		return;
> +
> +	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
> +		return;
> +
> +	/* Tested till 1GB pages */
> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
> +		return;
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (!slot || !slot->arch.pfns)
> +		return;
> +
> +	/*
> +	 * Use relative gfn index within the memslot for the bitmap as well as
> +	 * the pfns array
> +	 */
> +	rel_gfn = gfn - slot->base_gfn;
> +	aslot = &slot->arch;
> +	pin_pfn = pfn;
> +	npages = KVM_PAGES_PER_HPAGE(level);
> +
> +	/* Pin the page, KVM doesn't yet support page migration. */
> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
> +			old_pfn = aslot->pfns[rel_gfn];
> +			if (old_pfn == pin_pfn)
> +				continue;
> +
> +			put_page(pfn_to_page(old_pfn));
> +		}
> +
> +		set_bit(rel_gfn, aslot->pinned_bitmap);
> +		aslot->pfns[rel_gfn] = pin_pfn;
> +		get_page(pfn_to_page(pin_pfn));


I assume this is to replace KVM_MEMORY_ENCRYPT_REG_REGION, which ends up
calling svm_register_enc_region()->sev_pin_memory(), correct?

sev_pin_memory() correctly checks the RLIMIT_MEMLOCK and uses
pin_user_pages_fast().

I have to strongly assume that sev_pin_memory() is *wrong* as is because
it's supposed to supply FOLL_LONGTERM -- after all we're pinning these
pages possibly forever.


I might be wrong but

1. You are missing the RLIMIT_MEMLOCK check

2. get_page() is the wrong way of long-term pinning a page. You would
have to mimic what pin_user_pages_fast(FOLL_LONGTERM) does to eventually
get it right (e.g., migrate the page off of MIGRATE_CMA or ZONE_MOVABLE).
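
Taking the two points together, a rough sketch of what a corrected pinning helper could look like; the function name, its caller, and the exact error handling are illustrative assumptions rather than anything from the posted series:

/*
 * Illustrative sketch only: account the pages against RLIMIT_MEMLOCK the way
 * sev_pin_memory() already does, and take a long-term pin instead of a bare
 * get_page().
 */
static int sev_pin_range_longterm(struct kvm *kvm, unsigned long uaddr,
				  unsigned long npages, struct page **pages)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
	unsigned long locked, lock_limit;
	int pinned;

	/* Point 1: RLIMIT_MEMLOCK accounting, as sev_pin_memory() does today. */
	locked = sev->pages_locked + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		return -ENOMEM;

	/*
	 * Point 2: FOLL_LONGTERM migrates the pages off ZONE_MOVABLE/CMA
	 * before pinning them, which a bare get_page() never does.
	 */
	pinned = pin_user_pages_fast(uaddr, npages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned < 0)
		return pinned;
	if (pinned != npages) {
		unpin_user_pages(pages, pinned);
		return -ENOMEM;
	}

	sev->pages_locked = locked;
	return 0;
}

The matching release would then be unpin_user_pages() rather than put_page() wherever the reference is dropped today.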
Nikunj A. Dadhania Jan. 27, 2022, 4:29 p.m. UTC | #5
On 1/25/2022 11:29 PM, Peter Gonda wrote:
> On Tue, Jan 25, 2022 at 10:49 AM Nikunj A. Dadhania <nikunj@amd.com> wrote:
>>
>> Hi Peter
>>
>> On 1/25/2022 10:17 PM, Peter Gonda wrote:
>>>> @@ -1637,8 +1627,6 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>>>>         src->handle = 0;
>>>>         src->pages_locked = 0;
>>>>         src->enc_context_owner = NULL;
>>>> -
>>>> -       list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
>>> I think we need to move the pinned SPTE entries into the target, and
>>> repin the pages in the target here. Otherwise the pages will be
>>> unpinned when the source is cleaned up. Have you thought about how
>>> this could be done?

Right, copying just the list doesn't look to be sufficient. 

In the destination kvm context, we will have to go over the source region list of
pinned pages and pin them. Roughly something like the below:

struct list_head *head = &src->pinned_regions_list;
struct list_head *pos, *q;
struct pinned_region *new, *old;

if (!list_empty(head)) {
	list_for_each_safe(pos, q, head) {
		old = list_entry(pos, struct pinned_region, list);
		/* alloc new region and initialize with old */
		new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
		new->uaddr = old->uaddr;
		new->len = old->len;
		new->npages = old->npages;
		/* pin memory */
		new->pages = sev_pin_memory(kvm, new->uaddr, new->npages);
		list_add_tail(&new->list, &dst->pinned_regions_list);
		...
	}
}

>>>
>> I am testing migration with the pinned_list, and I see that all the guest pages are
>> transferred/pinned on the other side during migration. I think there is an
>> assumption that all private pages need to be moved.
>>
>> QEMU: target/i386/sev.c:bool sev_is_gfn_in_unshared_region(unsigned long gfn)
>>
>> Will dig more on this.
> 
> The code you linked appears to be for a remote migration. 

Yes, that is correct.

> This
> function is for an "intra-host" migration, meaning we are just moving
> the VM's memory and state to a new userspace VMM on the same host, not
> to an entirely new host.

Regards
Nikunj
Nikunj A. Dadhania Jan. 28, 2022, 6:57 a.m. UTC | #6
On 1/26/2022 4:16 PM, David Hildenbrand wrote:
> On 18.01.22 12:06, Nikunj A Dadhania wrote:
>> Use the memslot metadata to store the pinned data along with the pfns.
>> This improves the SEV guest startup time from O(n) to a constant by
>> deferring guest page pinning until the pages are used to satisfy nested
>> page faults. The page reference will be dropped in the memslot free
>> path.
>>
>> Remove the enc_region structure definition and the code which did
>> upfront pinning, as they are no longer needed in view of the demand
>> pinning support.
>>
>> Leave svm_register_enc_region() and svm_unregister_enc_region() as stubs
>> since qemu is dependent on this API.
>>
>> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
>> ---
>>  arch/x86/kvm/svm/sev.c | 208 ++++++++++++++++-------------------------
>>  arch/x86/kvm/svm/svm.c |   1 +
>>  arch/x86/kvm/svm/svm.h |   3 +-
>>  3 files changed, 81 insertions(+), 131 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
>> index d972ab4956d4..a962bed97a0b 100644
>> --- a/arch/x86/kvm/svm/sev.c
>> +++ b/arch/x86/kvm/svm/sev.c
>> @@ -66,14 +66,6 @@ static unsigned int nr_asids;
>>  static unsigned long *sev_asid_bitmap;
>>  static unsigned long *sev_reclaim_asid_bitmap;
>>  
>> -struct enc_region {
>> -	struct list_head list;
>> -	unsigned long npages;
>> -	struct page **pages;
>> -	unsigned long uaddr;
>> -	unsigned long size;
>> -};
>> -
>>  /* Called with the sev_bitmap_lock held, or on shutdown  */
>>  static int sev_flush_asids(int min_asid, int max_asid)
>>  {
>> @@ -257,8 +249,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>>  	if (ret)
>>  		goto e_free;
>>  
>> -	INIT_LIST_HEAD(&sev->regions_list);
>> -
>>  	return 0;
>>  
>>  e_free:
>> @@ -1637,8 +1627,6 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>>  	src->handle = 0;
>>  	src->pages_locked = 0;
>>  	src->enc_context_owner = NULL;
>> -
>> -	list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
>>  }
>>  
>>  static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
>> @@ -1861,115 +1849,13 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>>  int svm_register_enc_region(struct kvm *kvm,
>>  			    struct kvm_enc_region *range)
>>  {
>> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> -	struct enc_region *region;
>> -	int ret = 0;
>> -
>> -	if (!sev_guest(kvm))
>> -		return -ENOTTY;
>> -
>> -	/* If kvm is mirroring encryption context it isn't responsible for it */
>> -	if (is_mirroring_enc_context(kvm))
>> -		return -EINVAL;
>> -
>> -	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
>> -		return -EINVAL;
>> -
>> -	region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
>> -	if (!region)
>> -		return -ENOMEM;
>> -
>> -	mutex_lock(&kvm->lock);
>> -	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
>> -	if (IS_ERR(region->pages)) {
>> -		ret = PTR_ERR(region->pages);
>> -		mutex_unlock(&kvm->lock);
>> -		goto e_free;
>> -	}
>> -
>> -	region->uaddr = range->addr;
>> -	region->size = range->size;
>> -
>> -	list_add_tail(&region->list, &sev->regions_list);
>> -	mutex_unlock(&kvm->lock);
>> -
>> -	/*
>> -	 * The guest may change the memory encryption attribute from C=0 -> C=1
>> -	 * or vice versa for this memory range. Lets make sure caches are
>> -	 * flushed to ensure that guest data gets written into memory with
>> -	 * correct C-bit.
>> -	 */
>> -	sev_clflush_pages(region->pages, region->npages);
>> -
>> -	return ret;
>> -
>> -e_free:
>> -	kfree(region);
>> -	return ret;
>> -}
>> -
>> -static struct enc_region *
>> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
>> -{
>> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> -	struct list_head *head = &sev->regions_list;
>> -	struct enc_region *i;
>> -
>> -	list_for_each_entry(i, head, list) {
>> -		if (i->uaddr == range->addr &&
>> -		    i->size == range->size)
>> -			return i;
>> -	}
>> -
>> -	return NULL;
>> -}
>> -
>> -static void __unregister_enc_region_locked(struct kvm *kvm,
>> -					   struct enc_region *region)
>> -{
>> -	sev_unpin_memory(kvm, region->pages, region->npages);
>> -	list_del(&region->list);
>> -	kfree(region);
>> +	return 0;
>>  }
>>  
>>  int svm_unregister_enc_region(struct kvm *kvm,
>>  			      struct kvm_enc_region *range)
>>  {
>> -	struct enc_region *region;
>> -	int ret;
>> -
>> -	/* If kvm is mirroring encryption context it isn't responsible for it */
>> -	if (is_mirroring_enc_context(kvm))
>> -		return -EINVAL;
>> -
>> -	mutex_lock(&kvm->lock);
>> -
>> -	if (!sev_guest(kvm)) {
>> -		ret = -ENOTTY;
>> -		goto failed;
>> -	}
>> -
>> -	region = find_enc_region(kvm, range);
>> -	if (!region) {
>> -		ret = -EINVAL;
>> -		goto failed;
>> -	}
>> -
>> -	/*
>> -	 * Ensure that all guest tagged cache entries are flushed before
>> -	 * releasing the pages back to the system for use. CLFLUSH will
>> -	 * not do this, so issue a WBINVD.
>> -	 */
>> -	wbinvd_on_all_cpus();
>> -
>> -	__unregister_enc_region_locked(kvm, region);
>> -
>> -	mutex_unlock(&kvm->lock);
>>  	return 0;
>> -
>> -failed:
>> -	mutex_unlock(&kvm->lock);
>> -	return ret;
>>  }
>>  
>>  int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>> @@ -2018,7 +1904,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>  	mirror_sev->fd = source_sev->fd;
>>  	mirror_sev->es_active = source_sev->es_active;
>>  	mirror_sev->handle = source_sev->handle;
>> -	INIT_LIST_HEAD(&mirror_sev->regions_list);
>>  	ret = 0;
>>  
>>  	/*
>> @@ -2038,8 +1923,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>  void sev_vm_destroy(struct kvm *kvm)
>>  {
>>  	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> -	struct list_head *head = &sev->regions_list;
>> -	struct list_head *pos, *q;
>>  
>>  	WARN_ON(sev->num_mirrored_vms);
>>  
>> @@ -2066,18 +1949,6 @@ void sev_vm_destroy(struct kvm *kvm)
>>  	 */
>>  	wbinvd_on_all_cpus();
>>  
>> -	/*
>> -	 * if userspace was terminated before unregistering the memory regions
>> -	 * then lets unpin all the registered memory.
>> -	 */
>> -	if (!list_empty(head)) {
>> -		list_for_each_safe(pos, q, head) {
>> -			__unregister_enc_region_locked(kvm,
>> -				list_entry(pos, struct enc_region, list));
>> -			cond_resched();
>> -		}
>> -	}
>> -
>>  	sev_unbind_asid(kvm, sev->handle);
>>  	sev_asid_free(sev);
>>  }
>> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>>  	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>>  }
>>  
>> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>> +		  kvm_pfn_t pfn)
>> +{
>> +	struct kvm_arch_memory_slot *aslot;
>> +	struct kvm_memory_slot *slot;
>> +	gfn_t rel_gfn, pin_pfn;
>> +	unsigned long npages;
>> +	kvm_pfn_t old_pfn;
>> +	int i;
>> +
>> +	if (!sev_guest(kvm))
>> +		return;
>> +
>> +	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
>> +		return;
>> +
>> +	/* Tested till 1GB pages */
>> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
>> +		return;
>> +
>> +	slot = gfn_to_memslot(kvm, gfn);
>> +	if (!slot || !slot->arch.pfns)
>> +		return;
>> +
>> +	/*
>> +	 * Use relative gfn index within the memslot for the bitmap as well as
>> +	 * the pfns array
>> +	 */
>> +	rel_gfn = gfn - slot->base_gfn;
>> +	aslot = &slot->arch;
>> +	pin_pfn = pfn;
>> +	npages = KVM_PAGES_PER_HPAGE(level);
>> +
>> +	/* Pin the page, KVM doesn't yet support page migration. */
>> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
>> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
>> +			old_pfn = aslot->pfns[rel_gfn];
>> +			if (old_pfn == pin_pfn)
>> +				continue;
>> +
>> +			put_page(pfn_to_page(old_pfn));
>> +		}
>> +
>> +		set_bit(rel_gfn, aslot->pinned_bitmap);
>> +		aslot->pfns[rel_gfn] = pin_pfn;
>> +		get_page(pfn_to_page(pin_pfn));
> 
> 
> I assume this is to replace KVM_MEMORY_ENCRYPT_REG_REGION, which ends up
> calling svm_register_enc_region()->sev_pin_memory(), correct?

Yes, that is correct.
> 
> sev_pin_memory() correctly checks the RLIMIT_MEMLOCK and uses
> pin_user_pages_fast().
> 
> I have to strongly assume that sev_pin_memory() is *wrong* as is because
> it's supposed to supply FOLL_LONGTERM -- after all we're pinning these
> pages possibly forever.
> 
> 
> I might be wrong but
> 
> 1. You are missing the RLIMIT_MEMLOCK check

Yes, I will add this check during the enc_region registration.

> 2. get_page() is the wrong way of long-term pinning a page. You would
> have to mimic what pin_user_pages_fast(FOLL_LONGTERM) does to eventually
> get it right (e.g., migrate the page off of MIGRATE_CMA or ZONE_MOVABLE).

Let me go through this and I will come back. Thanks for pointing this out.

Regards
Nikunj
David Hildenbrand Jan. 28, 2022, 8:27 a.m. UTC | #7
On 28.01.22 07:57, Nikunj A. Dadhania wrote:
> On 1/26/2022 4:16 PM, David Hildenbrand wrote:
>> On 18.01.22 12:06, Nikunj A Dadhania wrote:
>>> Use the memslot metadata to store the pinned data along with the pfns.
>>> This improves the SEV guest startup time from O(n) to a constant by
>>> deferring guest page pinning until the pages are used to satisfy nested
>>> page faults. The page reference will be dropped in the memslot free
>>> path.
>>>
>>> Remove the enc_region structure definition and the code which did
>>> upfront pinning, as they are no longer needed in view of the demand
>>> pinning support.
>>>
>>> Leave svm_register_enc_region() and svm_unregister_enc_region() as stubs
>>> since qemu is dependent on this API.
>>>
>>> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
>>> ---
>>>  arch/x86/kvm/svm/sev.c | 208 ++++++++++++++++-------------------------
>>>  arch/x86/kvm/svm/svm.c |   1 +
>>>  arch/x86/kvm/svm/svm.h |   3 +-
>>>  3 files changed, 81 insertions(+), 131 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
>>> index d972ab4956d4..a962bed97a0b 100644
>>> --- a/arch/x86/kvm/svm/sev.c
>>> +++ b/arch/x86/kvm/svm/sev.c
>>> @@ -66,14 +66,6 @@ static unsigned int nr_asids;
>>>  static unsigned long *sev_asid_bitmap;
>>>  static unsigned long *sev_reclaim_asid_bitmap;
>>>  
>>> -struct enc_region {
>>> -	struct list_head list;
>>> -	unsigned long npages;
>>> -	struct page **pages;
>>> -	unsigned long uaddr;
>>> -	unsigned long size;
>>> -};
>>> -
>>>  /* Called with the sev_bitmap_lock held, or on shutdown  */
>>>  static int sev_flush_asids(int min_asid, int max_asid)
>>>  {
>>> @@ -257,8 +249,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>>>  	if (ret)
>>>  		goto e_free;
>>>  
>>> -	INIT_LIST_HEAD(&sev->regions_list);
>>> -
>>>  	return 0;
>>>  
>>>  e_free:
>>> @@ -1637,8 +1627,6 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>>>  	src->handle = 0;
>>>  	src->pages_locked = 0;
>>>  	src->enc_context_owner = NULL;
>>> -
>>> -	list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
>>>  }
>>>  
>>>  static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
>>> @@ -1861,115 +1849,13 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>>>  int svm_register_enc_region(struct kvm *kvm,
>>>  			    struct kvm_enc_region *range)
>>>  {
>>> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>> -	struct enc_region *region;
>>> -	int ret = 0;
>>> -
>>> -	if (!sev_guest(kvm))
>>> -		return -ENOTTY;
>>> -
>>> -	/* If kvm is mirroring encryption context it isn't responsible for it */
>>> -	if (is_mirroring_enc_context(kvm))
>>> -		return -EINVAL;
>>> -
>>> -	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
>>> -		return -EINVAL;
>>> -
>>> -	region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
>>> -	if (!region)
>>> -		return -ENOMEM;
>>> -
>>> -	mutex_lock(&kvm->lock);
>>> -	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
>>> -	if (IS_ERR(region->pages)) {
>>> -		ret = PTR_ERR(region->pages);
>>> -		mutex_unlock(&kvm->lock);
>>> -		goto e_free;
>>> -	}
>>> -
>>> -	region->uaddr = range->addr;
>>> -	region->size = range->size;
>>> -
>>> -	list_add_tail(&region->list, &sev->regions_list);
>>> -	mutex_unlock(&kvm->lock);
>>> -
>>> -	/*
>>> -	 * The guest may change the memory encryption attribute from C=0 -> C=1
>>> -	 * or vice versa for this memory range. Lets make sure caches are
>>> -	 * flushed to ensure that guest data gets written into memory with
>>> -	 * correct C-bit.
>>> -	 */
>>> -	sev_clflush_pages(region->pages, region->npages);
>>> -
>>> -	return ret;
>>> -
>>> -e_free:
>>> -	kfree(region);
>>> -	return ret;
>>> -}
>>> -
>>> -static struct enc_region *
>>> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
>>> -{
>>> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>> -	struct list_head *head = &sev->regions_list;
>>> -	struct enc_region *i;
>>> -
>>> -	list_for_each_entry(i, head, list) {
>>> -		if (i->uaddr == range->addr &&
>>> -		    i->size == range->size)
>>> -			return i;
>>> -	}
>>> -
>>> -	return NULL;
>>> -}
>>> -
>>> -static void __unregister_enc_region_locked(struct kvm *kvm,
>>> -					   struct enc_region *region)
>>> -{
>>> -	sev_unpin_memory(kvm, region->pages, region->npages);
>>> -	list_del(&region->list);
>>> -	kfree(region);
>>> +	return 0;
>>>  }
>>>  
>>>  int svm_unregister_enc_region(struct kvm *kvm,
>>>  			      struct kvm_enc_region *range)
>>>  {
>>> -	struct enc_region *region;
>>> -	int ret;
>>> -
>>> -	/* If kvm is mirroring encryption context it isn't responsible for it */
>>> -	if (is_mirroring_enc_context(kvm))
>>> -		return -EINVAL;
>>> -
>>> -	mutex_lock(&kvm->lock);
>>> -
>>> -	if (!sev_guest(kvm)) {
>>> -		ret = -ENOTTY;
>>> -		goto failed;
>>> -	}
>>> -
>>> -	region = find_enc_region(kvm, range);
>>> -	if (!region) {
>>> -		ret = -EINVAL;
>>> -		goto failed;
>>> -	}
>>> -
>>> -	/*
>>> -	 * Ensure that all guest tagged cache entries are flushed before
>>> -	 * releasing the pages back to the system for use. CLFLUSH will
>>> -	 * not do this, so issue a WBINVD.
>>> -	 */
>>> -	wbinvd_on_all_cpus();
>>> -
>>> -	__unregister_enc_region_locked(kvm, region);
>>> -
>>> -	mutex_unlock(&kvm->lock);
>>>  	return 0;
>>> -
>>> -failed:
>>> -	mutex_unlock(&kvm->lock);
>>> -	return ret;
>>>  }
>>>  
>>>  int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>> @@ -2018,7 +1904,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>>  	mirror_sev->fd = source_sev->fd;
>>>  	mirror_sev->es_active = source_sev->es_active;
>>>  	mirror_sev->handle = source_sev->handle;
>>> -	INIT_LIST_HEAD(&mirror_sev->regions_list);
>>>  	ret = 0;
>>>  
>>>  	/*
>>> @@ -2038,8 +1923,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>>  void sev_vm_destroy(struct kvm *kvm)
>>>  {
>>>  	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>> -	struct list_head *head = &sev->regions_list;
>>> -	struct list_head *pos, *q;
>>>  
>>>  	WARN_ON(sev->num_mirrored_vms);
>>>  
>>> @@ -2066,18 +1949,6 @@ void sev_vm_destroy(struct kvm *kvm)
>>>  	 */
>>>  	wbinvd_on_all_cpus();
>>>  
>>> -	/*
>>> -	 * if userspace was terminated before unregistering the memory regions
>>> -	 * then lets unpin all the registered memory.
>>> -	 */
>>> -	if (!list_empty(head)) {
>>> -		list_for_each_safe(pos, q, head) {
>>> -			__unregister_enc_region_locked(kvm,
>>> -				list_entry(pos, struct enc_region, list));
>>> -			cond_resched();
>>> -		}
>>> -	}
>>> -
>>>  	sev_unbind_asid(kvm, sev->handle);
>>>  	sev_asid_free(sev);
>>>  }
>>> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>>>  	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>>>  }
>>>  
>>> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>>> +		  kvm_pfn_t pfn)
>>> +{
>>> +	struct kvm_arch_memory_slot *aslot;
>>> +	struct kvm_memory_slot *slot;
>>> +	gfn_t rel_gfn, pin_pfn;
>>> +	unsigned long npages;
>>> +	kvm_pfn_t old_pfn;
>>> +	int i;
>>> +
>>> +	if (!sev_guest(kvm))
>>> +		return;
>>> +
>>> +	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
>>> +		return;
>>> +
>>> +	/* Tested till 1GB pages */
>>> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
>>> +		return;
>>> +
>>> +	slot = gfn_to_memslot(kvm, gfn);
>>> +	if (!slot || !slot->arch.pfns)
>>> +		return;
>>> +
>>> +	/*
>>> +	 * Use relative gfn index within the memslot for the bitmap as well as
>>> +	 * the pfns array
>>> +	 */
>>> +	rel_gfn = gfn - slot->base_gfn;
>>> +	aslot = &slot->arch;
>>> +	pin_pfn = pfn;
>>> +	npages = KVM_PAGES_PER_HPAGE(level);
>>> +
>>> +	/* Pin the page, KVM doesn't yet support page migration. */
>>> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
>>> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
>>> +			old_pfn = aslot->pfns[rel_gfn];
>>> +			if (old_pfn == pin_pfn)
>>> +				continue;
>>> +
>>> +			put_page(pfn_to_page(old_pfn));
>>> +		}
>>> +
>>> +		set_bit(rel_gfn, aslot->pinned_bitmap);
>>> +		aslot->pfns[rel_gfn] = pin_pfn;
>>> +		get_page(pfn_to_page(pin_pfn));
>>
>>
>> I assume this is to replace KVM_MEMORY_ENCRYPT_REG_REGION, which ends up
>> calling svm_register_enc_region()->sev_pin_memory(), correct?
> 
> Yes, that is correct.
>>
>> sev_pin_memory() correctly checks the RLIMIT_MEMLOCK and uses
>> pin_user_pages_fast().
>>
>> I have to strongly assume that sev_pin_memory() is *wrong* as is because
>> it's supposed to supply FOLL_LONGTERM -- after all we're pinning these
>> pages possibly forever.
>>
>>
>> I might be wrong but
>>
>> 1. You are missing the RLIMIT_MEMLOCK check
> 
> Yes, I will add this check during the enc_region registration.
> 
>> 2. get_page() is the wrong way of long-term pinning a page. You would
>> have to mimic what pin_user_pages_fast(FOLL_LONGTERM) does to eventually
>> get it right (e.g., migrate the page off of MIGRATE_CMA or ZONE_MOVABLE).
> 
> Let me go through this and I will come back. Thanks for pointing this out.

I assume the "issue" is that KVM uses the mmu notifier and does a simple
get_user_pages() to obtain the references, dropping the reference when
the entry is invalidated via an mmu notifier call. So once you intend to
long-term pin, it's already too late.

If you could teach KVM to do a long-term pin when stumbling over these
special encrypted memory regions (requires a proper matching
unpin_user_pages() call from KVM), then you could "take over" that pin
by get_page(), and let KVM do the ordinary put_page(), while you would
do the unpin_user_pages().
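
One way to read that handoff is as a reference-count walk-through; this is an illustrative sketch assuming non-compound pages, where FOLL_PIN raises the refcount by GUP_PIN_COUNTING_BIAS:

/*
 * KVM fault path:    pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, &page);
 *                        refcount += GUP_PIN_COUNTING_BIAS   (long-term pin taken)
 * sev_pin_spte():    get_page(page);                         (refcount += 1)
 * fault completion:  put_page(page);                         (KVM's ordinary release, refcount -= 1)
 * memslot freeing:   unpin_user_page(page);                  (refcount -= GUP_PIN_COUNTING_BIAS)
 *
 * Net effect: exactly one long-term pin is held from fault time until the
 * memslot is freed; the get_page()/put_page() pair only hands ownership of
 * that pin from KVM over to the SEV memslot metadata.
 */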
Nikunj A. Dadhania Jan. 28, 2022, 11:04 a.m. UTC | #8
On 1/28/2022 1:57 PM, David Hildenbrand wrote:
> On 28.01.22 07:57, Nikunj A. Dadhania wrote:
>> On 1/26/2022 4:16 PM, David Hildenbrand wrote:
>>> On 18.01.22 12:06, Nikunj A Dadhania wrote:
>>>> Use the memslot metadata to store the pinned data along with the pfns.
>>>> This improves the SEV guest startup time from O(n) to a constant by
>>>> deferring guest page pinning until the pages are used to satisfy nested
>>>> page faults. The page reference will be dropped in the memslot free
>>>> path.
>>>>
>>>> Remove the enc_region structure definition and the code which did
>>>> upfront pinning, as they are no longer needed in view of the demand
>>>> pinning support.
>>>>
>>>> Leave svm_register_enc_region() and svm_unregister_enc_region() as stubs
>>>> since qemu is dependent on this API.
>>>>
>>>> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
>>>> ---
>>>>  arch/x86/kvm/svm/sev.c | 208 ++++++++++++++++-------------------------
>>>>  arch/x86/kvm/svm/svm.c |   1 +
>>>>  arch/x86/kvm/svm/svm.h |   3 +-
>>>>  3 files changed, 81 insertions(+), 131 deletions(-)
>>>>
>>>> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
>>>> index d972ab4956d4..a962bed97a0b 100644
>>>> --- a/arch/x86/kvm/svm/sev.c
>>>> +++ b/arch/x86/kvm/svm/sev.c
>>>> @@ -66,14 +66,6 @@ static unsigned int nr_asids;
>>>>  static unsigned long *sev_asid_bitmap;
>>>>  static unsigned long *sev_reclaim_asid_bitmap;
>>>>  
>>>> -struct enc_region {
>>>> -	struct list_head list;
>>>> -	unsigned long npages;
>>>> -	struct page **pages;
>>>> -	unsigned long uaddr;
>>>> -	unsigned long size;
>>>> -};
>>>> -
>>>>  /* Called with the sev_bitmap_lock held, or on shutdown  */
>>>>  static int sev_flush_asids(int min_asid, int max_asid)
>>>>  {
>>>> @@ -257,8 +249,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>>>>  	if (ret)
>>>>  		goto e_free;
>>>>  
>>>> -	INIT_LIST_HEAD(&sev->regions_list);
>>>> -
>>>>  	return 0;
>>>>  
>>>>  e_free:
>>>> @@ -1637,8 +1627,6 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>>>>  	src->handle = 0;
>>>>  	src->pages_locked = 0;
>>>>  	src->enc_context_owner = NULL;
>>>> -
>>>> -	list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
>>>>  }
>>>>  
>>>>  static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
>>>> @@ -1861,115 +1849,13 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>>>>  int svm_register_enc_region(struct kvm *kvm,
>>>>  			    struct kvm_enc_region *range)
>>>>  {
>>>> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>>> -	struct enc_region *region;
>>>> -	int ret = 0;
>>>> -
>>>> -	if (!sev_guest(kvm))
>>>> -		return -ENOTTY;
>>>> -
>>>> -	/* If kvm is mirroring encryption context it isn't responsible for it */
>>>> -	if (is_mirroring_enc_context(kvm))
>>>> -		return -EINVAL;
>>>> -
>>>> -	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
>>>> -		return -EINVAL;
>>>> -
>>>> -	region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
>>>> -	if (!region)
>>>> -		return -ENOMEM;
>>>> -
>>>> -	mutex_lock(&kvm->lock);
>>>> -	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
>>>> -	if (IS_ERR(region->pages)) {
>>>> -		ret = PTR_ERR(region->pages);
>>>> -		mutex_unlock(&kvm->lock);
>>>> -		goto e_free;
>>>> -	}
>>>> -
>>>> -	region->uaddr = range->addr;
>>>> -	region->size = range->size;
>>>> -
>>>> -	list_add_tail(&region->list, &sev->regions_list);
>>>> -	mutex_unlock(&kvm->lock);
>>>> -
>>>> -	/*
>>>> -	 * The guest may change the memory encryption attribute from C=0 -> C=1
>>>> -	 * or vice versa for this memory range. Lets make sure caches are
>>>> -	 * flushed to ensure that guest data gets written into memory with
>>>> -	 * correct C-bit.
>>>> -	 */
>>>> -	sev_clflush_pages(region->pages, region->npages);
>>>> -
>>>> -	return ret;
>>>> -
>>>> -e_free:
>>>> -	kfree(region);
>>>> -	return ret;
>>>> -}
>>>> -
>>>> -static struct enc_region *
>>>> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
>>>> -{
>>>> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>>> -	struct list_head *head = &sev->regions_list;
>>>> -	struct enc_region *i;
>>>> -
>>>> -	list_for_each_entry(i, head, list) {
>>>> -		if (i->uaddr == range->addr &&
>>>> -		    i->size == range->size)
>>>> -			return i;
>>>> -	}
>>>> -
>>>> -	return NULL;
>>>> -}
>>>> -
>>>> -static void __unregister_enc_region_locked(struct kvm *kvm,
>>>> -					   struct enc_region *region)
>>>> -{
>>>> -	sev_unpin_memory(kvm, region->pages, region->npages);
>>>> -	list_del(&region->list);
>>>> -	kfree(region);
>>>> +	return 0;
>>>>  }
>>>>  
>>>>  int svm_unregister_enc_region(struct kvm *kvm,
>>>>  			      struct kvm_enc_region *range)
>>>>  {
>>>> -	struct enc_region *region;
>>>> -	int ret;
>>>> -
>>>> -	/* If kvm is mirroring encryption context it isn't responsible for it */
>>>> -	if (is_mirroring_enc_context(kvm))
>>>> -		return -EINVAL;
>>>> -
>>>> -	mutex_lock(&kvm->lock);
>>>> -
>>>> -	if (!sev_guest(kvm)) {
>>>> -		ret = -ENOTTY;
>>>> -		goto failed;
>>>> -	}
>>>> -
>>>> -	region = find_enc_region(kvm, range);
>>>> -	if (!region) {
>>>> -		ret = -EINVAL;
>>>> -		goto failed;
>>>> -	}
>>>> -
>>>> -	/*
>>>> -	 * Ensure that all guest tagged cache entries are flushed before
>>>> -	 * releasing the pages back to the system for use. CLFLUSH will
>>>> -	 * not do this, so issue a WBINVD.
>>>> -	 */
>>>> -	wbinvd_on_all_cpus();
>>>> -
>>>> -	__unregister_enc_region_locked(kvm, region);
>>>> -
>>>> -	mutex_unlock(&kvm->lock);
>>>>  	return 0;
>>>> -
>>>> -failed:
>>>> -	mutex_unlock(&kvm->lock);
>>>> -	return ret;
>>>>  }
>>>>  
>>>>  int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>>> @@ -2018,7 +1904,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>>>  	mirror_sev->fd = source_sev->fd;
>>>>  	mirror_sev->es_active = source_sev->es_active;
>>>>  	mirror_sev->handle = source_sev->handle;
>>>> -	INIT_LIST_HEAD(&mirror_sev->regions_list);
>>>>  	ret = 0;
>>>>  
>>>>  	/*
>>>> @@ -2038,8 +1923,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>>>  void sev_vm_destroy(struct kvm *kvm)
>>>>  {
>>>>  	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>>> -	struct list_head *head = &sev->regions_list;
>>>> -	struct list_head *pos, *q;
>>>>  
>>>>  	WARN_ON(sev->num_mirrored_vms);
>>>>  
>>>> @@ -2066,18 +1949,6 @@ void sev_vm_destroy(struct kvm *kvm)
>>>>  	 */
>>>>  	wbinvd_on_all_cpus();
>>>>  
>>>> -	/*
>>>> -	 * if userspace was terminated before unregistering the memory regions
>>>> -	 * then lets unpin all the registered memory.
>>>> -	 */
>>>> -	if (!list_empty(head)) {
>>>> -		list_for_each_safe(pos, q, head) {
>>>> -			__unregister_enc_region_locked(kvm,
>>>> -				list_entry(pos, struct enc_region, list));
>>>> -			cond_resched();
>>>> -		}
>>>> -	}
>>>> -
>>>>  	sev_unbind_asid(kvm, sev->handle);
>>>>  	sev_asid_free(sev);
>>>>  }
>>>> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>>>>  	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>>>>  }
>>>>  
>>>> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>>>> +		  kvm_pfn_t pfn)
>>>> +{
>>>> +	struct kvm_arch_memory_slot *aslot;
>>>> +	struct kvm_memory_slot *slot;
>>>> +	gfn_t rel_gfn, pin_pfn;
>>>> +	unsigned long npages;
>>>> +	kvm_pfn_t old_pfn;
>>>> +	int i;
>>>> +
>>>> +	if (!sev_guest(kvm))
>>>> +		return;
>>>> +
>>>> +	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
>>>> +		return;
>>>> +
>>>> +	/* Tested till 1GB pages */
>>>> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
>>>> +		return;
>>>> +
>>>> +	slot = gfn_to_memslot(kvm, gfn);
>>>> +	if (!slot || !slot->arch.pfns)
>>>> +		return;
>>>> +
>>>> +	/*
>>>> +	 * Use relative gfn index within the memslot for the bitmap as well as
>>>> +	 * the pfns array
>>>> +	 */
>>>> +	rel_gfn = gfn - slot->base_gfn;
>>>> +	aslot = &slot->arch;
>>>> +	pin_pfn = pfn;
>>>> +	npages = KVM_PAGES_PER_HPAGE(level);
>>>> +
>>>> +	/* Pin the page, KVM doesn't yet support page migration. */
>>>> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
>>>> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
>>>> +			old_pfn = aslot->pfns[rel_gfn];
>>>> +			if (old_pfn == pin_pfn)
>>>> +				continue;
>>>> +
>>>> +			put_page(pfn_to_page(old_pfn));
>>>> +		}
>>>> +
>>>> +		set_bit(rel_gfn, aslot->pinned_bitmap);
>>>> +		aslot->pfns[rel_gfn] = pin_pfn;
>>>> +		get_page(pfn_to_page(pin_pfn));
>>>
>>>
>>> I assume this is to replace KVM_MEMORY_ENCRYPT_REG_REGION, which ends up
>>> calling svm_register_enc_region()->sev_pin_memory(), correct?
>>
>> Yes, that is correct.
>>>
>>> sev_pin_memory() correctly checks the RLIMIT_MEMLOCK and uses
>>> pin_user_pages_fast().
>>>
>>> I have to strongly assume that sev_pin_memory() is *wrong* as is because
>>> it's supposed to supply FOLL_LONGTERM -- after all we're pinning these
>>> pages possibly forever.
>>>
>>>
>>> I might be wrong but
>>>
>>> 1. You are missing the RLIMIT_MEMLOCK check
>>
>> Yes, I will add this check during the enc_region registration.
>>
>>> 2. get_page() is the wrong way of long-term pinning a page. You would
>>> have to mimic what pin_user_pages_fast(FOLL_LONGTERM) does to eventually
>>> get it right (e.g., migrate the page off of MIGRATE_CMA or ZONE_MOVABLE).
>>
>> Let me go through this and I will come back. Thanks for pointing this out.
> 
> I assume the "issue" is that KVM uses the mmu notifier and does a simple
> get_user_pages() to obtain the references, dropping the reference when
> the entry is invalidated via an mmu notifier call. So once you intend to
> long-term pin, it's already too late.
> 
> If you could teach KVM to do a long-term pin when stumbling over these
> special encrypted memory regions (requires a proper matching
> unpin_user_pages() call from KVM), then you could "take over" that pin
> by get_page(), and let KVM do the ordinary put_page(), while you would
> do the unpin_user_pages().
> 

The fault path looks like this in KVM x86 mmu code:

direct_page_fault()
-> kvm_faultin_pfn()
   -> __gfn_to_pfn_memslot()
      -> hva_to_pfn()
         -> hva_to_pfn_{slow,fast}()
            -> get_user_pages_*()      <<<<==== This is where the
                                                reference is taken

The next step is to create the mappings, which is done in the functions below:

-> kvm_tdp_mmu_map() / __direct_map()

   -> Within this function (patch 1/6), I call sev_pin_spte to take an extra 
      reference to pin it using get_page. 

      Is it possible to use pin_user_pages(FOLL_LONGTERM) here? Wouldn't that 
      be equivalent to the "take over" solution that you are suggesting?

The reference is released when direct_page_fault() completes, using put_page().

Later when the SEV VM is shutting down, I can do unpin_user_pages() for the 
pinned pages.
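
A minimal sketch of that direction in the sev_pin_spte() path, assuming the userspace address can be recovered with gfn_to_hva_memslot() and leaving aside whether sleeping is allowed at this call site (illustrative only, not from the posted series):

/* Illustrative replacement for the get_page() reference in sev_pin_spte(). */
static int sev_pin_gfn_longterm(struct kvm_memory_slot *slot, gfn_t gfn)
{
	struct kvm_arch_memory_slot *aslot = &slot->arch;
	unsigned long hva = gfn_to_hva_memslot(slot, gfn);
	gfn_t rel_gfn = gfn - slot->base_gfn;
	struct page *page;

	/* FOLL_LONGTERM migrates the page off ZONE_MOVABLE/CMA before pinning. */
	if (pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, &page) != 1)
		return -ENOMEM;

	set_bit(rel_gfn, aslot->pinned_bitmap);
	aslot->pfns[rel_gfn] = page_to_pfn(page);
	return 0;
}

The release in sev_free_memslot() would then become unpin_user_page(pfn_to_page(pfns[i])) instead of put_page().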

Regards
Nikunj
David Hildenbrand Jan. 28, 2022, 11:08 a.m. UTC | #9
On 28.01.22 12:04, Nikunj A. Dadhania wrote:
> On 1/28/2022 1:57 PM, David Hildenbrand wrote:
>> On 28.01.22 07:57, Nikunj A. Dadhania wrote:
>>> On 1/26/2022 4:16 PM, David Hildenbrand wrote:
>>>> On 18.01.22 12:06, Nikunj A Dadhania wrote:
>>>>> Use the memslot metadata to store the pinned data along with the pfns.
>>>>> This improves the SEV guest startup time from O(n) to a constant by
>>>>> deferring guest page pinning until the pages are used to satisfy nested
>>>>> page faults. The page reference will be dropped in the memslot free
>>>>> path.
>>>>>
>>>>> Remove the enc_region structure definition and the code which did
>>>>> upfront pinning, as they are no longer needed in view of the demand
>>>>> pinning support.
>>>>>
>>>>> Leave svm_register_enc_region() and svm_unregister_enc_region() as stubs
>>>>> since qemu is dependent on this API.
>>>>>
>>>>> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
>>>>> ---
>>>>>  arch/x86/kvm/svm/sev.c | 208 ++++++++++++++++-------------------------
>>>>>  arch/x86/kvm/svm/svm.c |   1 +
>>>>>  arch/x86/kvm/svm/svm.h |   3 +-
>>>>>  3 files changed, 81 insertions(+), 131 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
>>>>> index d972ab4956d4..a962bed97a0b 100644
>>>>> --- a/arch/x86/kvm/svm/sev.c
>>>>> +++ b/arch/x86/kvm/svm/sev.c
>>>>> @@ -66,14 +66,6 @@ static unsigned int nr_asids;
>>>>>  static unsigned long *sev_asid_bitmap;
>>>>>  static unsigned long *sev_reclaim_asid_bitmap;
>>>>>  
>>>>> -struct enc_region {
>>>>> -	struct list_head list;
>>>>> -	unsigned long npages;
>>>>> -	struct page **pages;
>>>>> -	unsigned long uaddr;
>>>>> -	unsigned long size;
>>>>> -};
>>>>> -
>>>>>  /* Called with the sev_bitmap_lock held, or on shutdown  */
>>>>>  static int sev_flush_asids(int min_asid, int max_asid)
>>>>>  {
>>>>> @@ -257,8 +249,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>>>>>  	if (ret)
>>>>>  		goto e_free;
>>>>>  
>>>>> -	INIT_LIST_HEAD(&sev->regions_list);
>>>>> -
>>>>>  	return 0;
>>>>>  
>>>>>  e_free:
>>>>> @@ -1637,8 +1627,6 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>>>>>  	src->handle = 0;
>>>>>  	src->pages_locked = 0;
>>>>>  	src->enc_context_owner = NULL;
>>>>> -
>>>>> -	list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
>>>>>  }
>>>>>  
>>>>>  static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
>>>>> @@ -1861,115 +1849,13 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
>>>>>  int svm_register_enc_region(struct kvm *kvm,
>>>>>  			    struct kvm_enc_region *range)
>>>>>  {
>>>>> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>>>> -	struct enc_region *region;
>>>>> -	int ret = 0;
>>>>> -
>>>>> -	if (!sev_guest(kvm))
>>>>> -		return -ENOTTY;
>>>>> -
>>>>> -	/* If kvm is mirroring encryption context it isn't responsible for it */
>>>>> -	if (is_mirroring_enc_context(kvm))
>>>>> -		return -EINVAL;
>>>>> -
>>>>> -	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
>>>>> -		return -EINVAL;
>>>>> -
>>>>> -	region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
>>>>> -	if (!region)
>>>>> -		return -ENOMEM;
>>>>> -
>>>>> -	mutex_lock(&kvm->lock);
>>>>> -	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
>>>>> -	if (IS_ERR(region->pages)) {
>>>>> -		ret = PTR_ERR(region->pages);
>>>>> -		mutex_unlock(&kvm->lock);
>>>>> -		goto e_free;
>>>>> -	}
>>>>> -
>>>>> -	region->uaddr = range->addr;
>>>>> -	region->size = range->size;
>>>>> -
>>>>> -	list_add_tail(&region->list, &sev->regions_list);
>>>>> -	mutex_unlock(&kvm->lock);
>>>>> -
>>>>> -	/*
>>>>> -	 * The guest may change the memory encryption attribute from C=0 -> C=1
>>>>> -	 * or vice versa for this memory range. Lets make sure caches are
>>>>> -	 * flushed to ensure that guest data gets written into memory with
>>>>> -	 * correct C-bit.
>>>>> -	 */
>>>>> -	sev_clflush_pages(region->pages, region->npages);
>>>>> -
>>>>> -	return ret;
>>>>> -
>>>>> -e_free:
>>>>> -	kfree(region);
>>>>> -	return ret;
>>>>> -}
>>>>> -
>>>>> -static struct enc_region *
>>>>> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
>>>>> -{
>>>>> -	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>>>> -	struct list_head *head = &sev->regions_list;
>>>>> -	struct enc_region *i;
>>>>> -
>>>>> -	list_for_each_entry(i, head, list) {
>>>>> -		if (i->uaddr == range->addr &&
>>>>> -		    i->size == range->size)
>>>>> -			return i;
>>>>> -	}
>>>>> -
>>>>> -	return NULL;
>>>>> -}
>>>>> -
>>>>> -static void __unregister_enc_region_locked(struct kvm *kvm,
>>>>> -					   struct enc_region *region)
>>>>> -{
>>>>> -	sev_unpin_memory(kvm, region->pages, region->npages);
>>>>> -	list_del(&region->list);
>>>>> -	kfree(region);
>>>>> +	return 0;
>>>>>  }
>>>>>  
>>>>>  int svm_unregister_enc_region(struct kvm *kvm,
>>>>>  			      struct kvm_enc_region *range)
>>>>>  {
>>>>> -	struct enc_region *region;
>>>>> -	int ret;
>>>>> -
>>>>> -	/* If kvm is mirroring encryption context it isn't responsible for it */
>>>>> -	if (is_mirroring_enc_context(kvm))
>>>>> -		return -EINVAL;
>>>>> -
>>>>> -	mutex_lock(&kvm->lock);
>>>>> -
>>>>> -	if (!sev_guest(kvm)) {
>>>>> -		ret = -ENOTTY;
>>>>> -		goto failed;
>>>>> -	}
>>>>> -
>>>>> -	region = find_enc_region(kvm, range);
>>>>> -	if (!region) {
>>>>> -		ret = -EINVAL;
>>>>> -		goto failed;
>>>>> -	}
>>>>> -
>>>>> -	/*
>>>>> -	 * Ensure that all guest tagged cache entries are flushed before
>>>>> -	 * releasing the pages back to the system for use. CLFLUSH will
>>>>> -	 * not do this, so issue a WBINVD.
>>>>> -	 */
>>>>> -	wbinvd_on_all_cpus();
>>>>> -
>>>>> -	__unregister_enc_region_locked(kvm, region);
>>>>> -
>>>>> -	mutex_unlock(&kvm->lock);
>>>>>  	return 0;
>>>>> -
>>>>> -failed:
>>>>> -	mutex_unlock(&kvm->lock);
>>>>> -	return ret;
>>>>>  }
>>>>>  
>>>>>  int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>>>> @@ -2018,7 +1904,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>>>>  	mirror_sev->fd = source_sev->fd;
>>>>>  	mirror_sev->es_active = source_sev->es_active;
>>>>>  	mirror_sev->handle = source_sev->handle;
>>>>> -	INIT_LIST_HEAD(&mirror_sev->regions_list);
>>>>>  	ret = 0;
>>>>>  
>>>>>  	/*
>>>>> @@ -2038,8 +1923,6 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>>>>>  void sev_vm_destroy(struct kvm *kvm)
>>>>>  {
>>>>>  	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>>>> -	struct list_head *head = &sev->regions_list;
>>>>> -	struct list_head *pos, *q;
>>>>>  
>>>>>  	WARN_ON(sev->num_mirrored_vms);
>>>>>  
>>>>> @@ -2066,18 +1949,6 @@ void sev_vm_destroy(struct kvm *kvm)
>>>>>  	 */
>>>>>  	wbinvd_on_all_cpus();
>>>>>  
>>>>> -	/*
>>>>> -	 * if userspace was terminated before unregistering the memory regions
>>>>> -	 * then lets unpin all the registered memory.
>>>>> -	 */
>>>>> -	if (!list_empty(head)) {
>>>>> -		list_for_each_safe(pos, q, head) {
>>>>> -			__unregister_enc_region_locked(kvm,
>>>>> -				list_entry(pos, struct enc_region, list));
>>>>> -			cond_resched();
>>>>> -		}
>>>>> -	}
>>>>> -
>>>>>  	sev_unbind_asid(kvm, sev->handle);
>>>>>  	sev_asid_free(sev);
>>>>>  }
>>>>> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>>>>>  	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>>>>>  }
>>>>>  
>>>>> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>>>>> +		  kvm_pfn_t pfn)
>>>>> +{
>>>>> +	struct kvm_arch_memory_slot *aslot;
>>>>> +	struct kvm_memory_slot *slot;
>>>>> +	gfn_t rel_gfn, pin_pfn;
>>>>> +	unsigned long npages;
>>>>> +	kvm_pfn_t old_pfn;
>>>>> +	int i;
>>>>> +
>>>>> +	if (!sev_guest(kvm))
>>>>> +		return;
>>>>> +
>>>>> +	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
>>>>> +		return;
>>>>> +
>>>>> +	/* Tested till 1GB pages */
>>>>> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
>>>>> +		return;
>>>>> +
>>>>> +	slot = gfn_to_memslot(kvm, gfn);
>>>>> +	if (!slot || !slot->arch.pfns)
>>>>> +		return;
>>>>> +
>>>>> +	/*
>>>>> +	 * Use relative gfn index within the memslot for the bitmap as well as
>>>>> +	 * the pfns array
>>>>> +	 */
>>>>> +	rel_gfn = gfn - slot->base_gfn;
>>>>> +	aslot = &slot->arch;
>>>>> +	pin_pfn = pfn;
>>>>> +	npages = KVM_PAGES_PER_HPAGE(level);
>>>>> +
>>>>> +	/* Pin the page, KVM doesn't yet support page migration. */
>>>>> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
>>>>> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
>>>>> +			old_pfn = aslot->pfns[rel_gfn];
>>>>> +			if (old_pfn == pin_pfn)
>>>>> +				continue;
>>>>> +
>>>>> +			put_page(pfn_to_page(old_pfn));
>>>>> +		}
>>>>> +
>>>>> +		set_bit(rel_gfn, aslot->pinned_bitmap);
>>>>> +		aslot->pfns[rel_gfn] = pin_pfn;
>>>>> +		get_page(pfn_to_page(pin_pfn));
>>>>
>>>>
>>>> I assume this is to replace KVM_MEMORY_ENCRYPT_REG_REGION, which ends up
>>>> calling svm_register_enc_region()->sev_pin_memory(), correct?
>>>
>>> Yes, that is correct.
>>>>
>>>> sev_pin_memory() correctly checks the RLIMIT_MEMLOCK and uses
>>>> pin_user_pages_fast().
>>>>
>>>> I have to strongly assume that sev_pin_memory() is *wrong* as is because
>>>> it's supposed to supply FOLL_LONGTERM -- after all we're pinning these
>>>> pages possibly forever.
>>>>
>>>>
>>>> I might be wrong but
>>>>
>>>> 1. You are missing the RLIMIT_MEMLOCK check
>>>
>>> Yes, I will add this check during the enc_region registration.
>>>
>>>> 2. get_page() is the wong way of long-term pinning a page. You would
>>>> have to mimic what pin_user_pages_fast(FOLL_LONGTERM) does to eventually
>>>> get it right (e.g., migrate the page off of MIGRATE_CMA or ZONE_MOVABLE).
>>>
>>> Let me go through this and I will come back. Thanks for pointing this out.
>>
>> I asusme the "issue" is that KVM uses mmu notifier and does a simple
>> get_user_pages() to obtain the references, to drop the reference when
>> the entry is invalidated via a mmu notifier call. So once you intent to
>> long-term pin, it's already to late.
>>
>> If you could teach KVM to do a long-term pin when stumbling over these
>> special encrypted memory regions (requires a proper matching
>> unpin_user_pages() call from KVM), then you could "take over" that pin
>> by get_page(), and let KVM do the ordinary put_page(), while you would
>> do the unpin_user_pages().
>>
> 
> The fault path looks like this in KVM x86 mmu code:
> 
> direct_page_fault()
> -> kvm_faultin_pfn()
>    -> __gfn_to_pfn_memslot()
>       -> hva_to_pfn()
>          -> hva_to_pfn_{slow,fast}()
>             -> get_user_pages_*()      <<<<==== This is where the
>                                                 reference is taken
> 
> Next step is to create the mappings which is done in below functions:
> 
> -> kvm_tdp_mmu_map() / __direct_map()
> 
>    -> Within this function (patch 1/6), I call sev_pin_spte to take an extra 
>       reference to pin it using get_page. 
> 
>       Is it possible to use pin_user_pages(FOLL_LONGTERM) here? Wouldn't that 
>       be equivalent to "take over" solution that you are suggesting?
> 

The issue is that pin_user_pages(FOLL_LONGTERM) might have to migrate
the page, which will fail if there is already an additional reference
from get_user_pages_*().
David Hildenbrand Jan. 31, 2022, 11:56 a.m. UTC | #10
On 28.01.22 12:08, David Hildenbrand wrote:
> On 28.01.22 12:04, Nikunj A. Dadhania wrote:
>> On 1/28/2022 1:57 PM, David Hildenbrand wrote:
>>> On 28.01.22 07:57, Nikunj A. Dadhania wrote:
>>>> On 1/26/2022 4:16 PM, David Hildenbrand wrote:
>>>>> On 18.01.22 12:06, Nikunj A Dadhania wrote:
>>>>>> [...]
>>>>>> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>>>>>>  	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>>>>>>  }
>>>>>>  
>>>>>> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>>>>>> +		  kvm_pfn_t pfn)
>>>>>> +{
>>>>>> +	struct kvm_arch_memory_slot *aslot;
>>>>>> +	struct kvm_memory_slot *slot;
>>>>>> +	gfn_t rel_gfn, pin_pfn;
>>>>>> +	unsigned long npages;
>>>>>> +	kvm_pfn_t old_pfn;
>>>>>> +	int i;
>>>>>> +
>>>>>> +	if (!sev_guest(kvm))
>>>>>> +		return;
>>>>>> +
>>>>>> +	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
>>>>>> +		return;
>>>>>> +
>>>>>> +	/* Tested till 1GB pages */
>>>>>> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
>>>>>> +		return;
>>>>>> +
>>>>>> +	slot = gfn_to_memslot(kvm, gfn);
>>>>>> +	if (!slot || !slot->arch.pfns)
>>>>>> +		return;
>>>>>> +
>>>>>> +	/*
>>>>>> +	 * Use relative gfn index within the memslot for the bitmap as well as
>>>>>> +	 * the pfns array
>>>>>> +	 */
>>>>>> +	rel_gfn = gfn - slot->base_gfn;
>>>>>> +	aslot = &slot->arch;
>>>>>> +	pin_pfn = pfn;
>>>>>> +	npages = KVM_PAGES_PER_HPAGE(level);
>>>>>> +
>>>>>> +	/* Pin the page, KVM doesn't yet support page migration. */
>>>>>> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
>>>>>> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
>>>>>> +			old_pfn = aslot->pfns[rel_gfn];
>>>>>> +			if (old_pfn == pin_pfn)
>>>>>> +				continue;
>>>>>> +
>>>>>> +			put_page(pfn_to_page(old_pfn));
>>>>>> +		}
>>>>>> +
>>>>>> +		set_bit(rel_gfn, aslot->pinned_bitmap);
>>>>>> +		aslot->pfns[rel_gfn] = pin_pfn;
>>>>>> +		get_page(pfn_to_page(pin_pfn));
>>>>>
>>>>>
>>>>> I assume this is to replace KVM_MEMORY_ENCRYPT_REG_REGION, which ends up
>>>>> calling svm_register_enc_region()->sev_pin_memory(), correct?
>>>>
>>>> Yes, that is correct.
>>>>>
>>>>> sev_pin_memory() correctly checks the RLIMIT_MEMLOCK and uses
>>>>> pin_user_pages_fast().
>>>>>
>>>>> I have to strongly assume that sev_pin_memory() is *wrong* as is because
>>>>> it's supposed to supply FOLL_LONGTERM -- after all we're pinning these
>>>>> pages possibly forever.
>>>>>
>>>>>
>>>>> I might be wrong but
>>>>>
>>>>> 1. You are missing the RLIMIT_MEMLOCK check
>>>>
>>>> Yes, I will add this check during the enc_region registration.
>>>>
>>>>> 2. get_page() is the wong way of long-term pinning a page. You would
>>>>> have to mimic what pin_user_pages_fast(FOLL_LONGTERM) does to eventually
>>>>> get it right (e.g., migrate the page off of MIGRATE_CMA or ZONE_MOVABLE).
>>>>
>>>> Let me go through this and I will come back. Thanks for pointing this out.
>>>
>>> I asusme the "issue" is that KVM uses mmu notifier and does a simple
>>> get_user_pages() to obtain the references, to drop the reference when
>>> the entry is invalidated via a mmu notifier call. So once you intent to
>>> long-term pin, it's already to late.
>>>
>>> If you could teach KVM to do a long-term pin when stumbling over these
>>> special encrypted memory regions (requires a proper matching
>>> unpin_user_pages() call from KVM), then you could "take over" that pin
>>> by get_page(), and let KVM do the ordinary put_page(), while you would
>>> do the unpin_user_pages().
>>>
>>
>> The fault path looks like this in KVM x86 mmu code:
>>
>> direct_page_fault()
>> -> kvm_faultin_pfn()
>>    -> __gfn_to_pfn_memslot()
>>       -> hva_to_pfn()
>>          -> hva_to_pfn_{slow,fast}()
>>             -> get_user_pages_*()      <<<<==== This is where the
>>                                                 reference is taken
>>
>> Next step is to create the mappings which is done in below functions:
>>
>> -> kvm_tdp_mmu_map() / __direct_map()
>>
>>    -> Within this function (patch 1/6), I call sev_pin_spte to take an extra 
>>       reference to pin it using get_page. 
>>
>>       Is it possible to use pin_user_pages(FOLL_LONGTERM) here? Wouldn't that 
>>       be equivalent to "take over" solution that you are suggesting?
>>
> 
> The issue is that pin_user_pages(FOLL_LONGTERM) might have to migrate
> the page, which will fail if there is already an additional reference
> from get_user_pages_*().
> 

Minor addition: hva_to_pfn_{slow,fast}() *don't* take a reference,
because we neither supply FOLL_GET nor FOLL_PIN. GUP users that rely on
memory notifiers don't require references.

I don't know what the implications would be if you do FOLL_PIN |
FOLL_LONGTERM after already having a reference via
hva_to_pfn_{slow,fast}() in your hand in the call path. Migration code
would effectively want to unmap the old page and call mmu notifiers to
properly invalidate the KVM MMU ...

In an ideal world, you'd really do a FOLL_PIN | FOLL_LONGTERM right away,
not doing the get_user_pages_*() first.
Nikunj A. Dadhania Jan. 31, 2022, 12:18 p.m. UTC | #11
On 1/31/2022 5:26 PM, David Hildenbrand wrote:
> On 28.01.22 12:08, David Hildenbrand wrote:
>> On 28.01.22 12:04, Nikunj A. Dadhania wrote:
>>> On 1/28/2022 1:57 PM, David Hildenbrand wrote:
>>>> On 28.01.22 07:57, Nikunj A. Dadhania wrote:
>>>>> On 1/26/2022 4:16 PM, David Hildenbrand wrote:
>>>>>> On 18.01.22 12:06, Nikunj A Dadhania wrote:
>>>>>>> [...]
>>>>>>> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>>>>>>>  	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>>>>>>>  }
>>>>>>>  
>>>>>>> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>>>>>>> +		  kvm_pfn_t pfn)
>>>>>>> +{
>>>>>>> +	struct kvm_arch_memory_slot *aslot;
>>>>>>> +	struct kvm_memory_slot *slot;
>>>>>>> +	gfn_t rel_gfn, pin_pfn;
>>>>>>> +	unsigned long npages;
>>>>>>> +	kvm_pfn_t old_pfn;
>>>>>>> +	int i;
>>>>>>> +
>>>>>>> +	if (!sev_guest(kvm))
>>>>>>> +		return;
>>>>>>> +
>>>>>>> +	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
>>>>>>> +		return;
>>>>>>> +
>>>>>>> +	/* Tested till 1GB pages */
>>>>>>> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
>>>>>>> +		return;
>>>>>>> +
>>>>>>> +	slot = gfn_to_memslot(kvm, gfn);
>>>>>>> +	if (!slot || !slot->arch.pfns)
>>>>>>> +		return;
>>>>>>> +
>>>>>>> +	/*
>>>>>>> +	 * Use relative gfn index within the memslot for the bitmap as well as
>>>>>>> +	 * the pfns array
>>>>>>> +	 */
>>>>>>> +	rel_gfn = gfn - slot->base_gfn;
>>>>>>> +	aslot = &slot->arch;
>>>>>>> +	pin_pfn = pfn;
>>>>>>> +	npages = KVM_PAGES_PER_HPAGE(level);
>>>>>>> +
>>>>>>> +	/* Pin the page, KVM doesn't yet support page migration. */
>>>>>>> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
>>>>>>> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
>>>>>>> +			old_pfn = aslot->pfns[rel_gfn];
>>>>>>> +			if (old_pfn == pin_pfn)
>>>>>>> +				continue;
>>>>>>> +
>>>>>>> +			put_page(pfn_to_page(old_pfn));
>>>>>>> +		}
>>>>>>> +
>>>>>>> +		set_bit(rel_gfn, aslot->pinned_bitmap);
>>>>>>> +		aslot->pfns[rel_gfn] = pin_pfn;
>>>>>>> +		get_page(pfn_to_page(pin_pfn));
>>>>>>
>>>>>>
>>>>>> I assume this is to replace KVM_MEMORY_ENCRYPT_REG_REGION, which ends up
>>>>>> calling svm_register_enc_region()->sev_pin_memory(), correct?
>>>>>
>>>>> Yes, that is correct.
>>>>>>
>>>>>> sev_pin_memory() correctly checks the RLIMIT_MEMLOCK and uses
>>>>>> pin_user_pages_fast().
>>>>>>
>>>>>> I have to strongly assume that sev_pin_memory() is *wrong* as is because
>>>>>> it's supposed to supply FOLL_LONGTERM -- after all we're pinning these
>>>>>> pages possibly forever.
>>>>>>
>>>>>>
>>>>>> I might be wrong but
>>>>>>
>>>>>> 1. You are missing the RLIMIT_MEMLOCK check
>>>>>
>>>>> Yes, I will add this check during the enc_region registration.
>>>>>
>>>>>> 2. get_page() is the wong way of long-term pinning a page. You would
>>>>>> have to mimic what pin_user_pages_fast(FOLL_LONGTERM) does to eventually
>>>>>> get it right (e.g., migrate the page off of MIGRATE_CMA or ZONE_MOVABLE).
>>>>>
>>>>> Let me go through this and I will come back. Thanks for pointing this out.
>>>>
>>>> I asusme the "issue" is that KVM uses mmu notifier and does a simple
>>>> get_user_pages() to obtain the references, to drop the reference when
>>>> the entry is invalidated via a mmu notifier call. So once you intent to
>>>> long-term pin, it's already to late.
>>>>
>>>> If you could teach KVM to do a long-term pin when stumbling over these
>>>> special encrypted memory regions (requires a proper matching
>>>> unpin_user_pages() call from KVM), then you could "take over" that pin
>>>> by get_page(), and let KVM do the ordinary put_page(), while you would
>>>> do the unpin_user_pages().
>>>>
>>>
>>> The fault path looks like this in KVM x86 mmu code:
>>>
>>> direct_page_fault()
>>> -> kvm_faultin_pfn()
>>>    -> __gfn_to_pfn_memslot()
>>>       -> hva_to_pfn()
>>>          -> hva_to_pfn_{slow,fast}()
>>>             -> get_user_pages_*()      <<<<==== This is where the
>>>                                                 reference is taken
>>>
>>> Next step is to create the mappings which is done in below functions:
>>>
>>> -> kvm_tdp_mmu_map() / __direct_map()
>>>
>>>    -> Within this function (patch 1/6), I call sev_pin_spte to take an extra 
>>>       reference to pin it using get_page. 
>>>
>>>       Is it possible to use pin_user_pages(FOLL_LONGTERM) here? Wouldn't that 
>>>       be equivalent to "take over" solution that you are suggesting?
>>>
>>
>> The issue is that pin_user_pages(FOLL_LONGTERM) might have to migrate
>> the page, which will fail if there is already an additional reference
>> from get_user_pages_*().
>>
> 
> Minor addition: hva_to_pfn_{slow,fast}() *don't* take a reference,

hva_to_pfn_fast() does take a reference; I am not able to find it in _slow() though.

->get_user_page_fast_only()
  -> get_user_pages_fast_only()
     ...
     gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
     ...

> because we neither supply FOLL_GET nor FOLL_PIN. GUP users that rely on
> memory notifiers don't require refernces.
> 
> I don't know what the implications would be if you FOLL_PIN |
> FOLL_LONGTERM after already having a reference via
> hva_to_pfn_{slow,fast}() in your hand in the callpath. Migration code
> would effectively want to unmap the old page and call mmu notifiers to
> properly invalidate the KVM MMU ...
> 
> In an ideal word, you'd really do a FOLL_PIN | FOLL_LONGTERM right away,
> not doing the  get_user_pages_*()  first.
> 

I am thinking along the same lines:

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index eff3ef64722b..fd7c878ab03d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2379,9 +2379,10 @@ static inline int check_user_page_hwpoison(unsigned long addr)
  * only part that runs if we can in atomic context.
  */
 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
-                           bool *writable, kvm_pfn_t *pfn)
+                           bool *writable, kvm_pfn_t *pfn, bool pin_longterm)
 {
        struct page *page[1];
+       bool ret;

        /*
         * Fast pin a writable pfn only if it is a write fault request
@@ -2391,7 +2392,12 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
        if (!(write_fault || writable))
                return false;

-       if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
+       if (!pin_longterm)
+               ret = get_user_page_fast_only(addr, FOLL_WRITE, page);
+       else
+               ret = pin_user_pages_fast(addr, 1, FOLL_WRITE | FOLL_LONGTERM, page);
+
+       if (ret) {
                *pfn = page_to_pfn(page[0]);


And pin_longterm could be determined using a memslot flag:

#define KVM_MEMSLOT_LONGTERM    (1UL << 17)
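(A minimal sketch, for illustration only, of how such a flag could be
consulted on the way down to hva_to_pfn_fast(); both the KVM_MEMSLOT_LONGTERM
bit above and the extra pin_longterm argument are proposals from this thread,
not existing KVM code, and kvm_slot_longterm_pin() is a hypothetical helper.)

static inline bool kvm_slot_longterm_pin(const struct kvm_memory_slot *slot)
{
	return slot && (slot->flags & KVM_MEMSLOT_LONGTERM);
}

/*
 * __gfn_to_pfn_memslot() could then make the decision once and forward it
 * through hva_to_pfn() to the fast/slow paths:
 *
 *	pin_longterm = kvm_slot_longterm_pin(slot);
 *	pfn = hva_to_pfn(addr, atomic, async, write_fault, writable,
 *			 pin_longterm);
 */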

Regards
Nikunj
David Hildenbrand Jan. 31, 2022, 12:41 p.m. UTC | #12
On 31.01.22 13:18, Nikunj A. Dadhania wrote:
> On 1/31/2022 5:26 PM, David Hildenbrand wrote:
>> On 28.01.22 12:08, David Hildenbrand wrote:
>>> On 28.01.22 12:04, Nikunj A. Dadhania wrote:
>>>> On 1/28/2022 1:57 PM, David Hildenbrand wrote:
>>>>> On 28.01.22 07:57, Nikunj A. Dadhania wrote:
>>>>>> On 1/26/2022 4:16 PM, David Hildenbrand wrote:
>>>>>>> On 18.01.22 12:06, Nikunj A Dadhania wrote:
>>>>>>>> [...]
>>>>>>>> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>>>>>>>>  	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>>>>>>>>  }
>>>>>>>>  
>>>>>>>> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
>>>>>>>> +		  kvm_pfn_t pfn)
>>>>>>>> +{
>>>>>>>> +	struct kvm_arch_memory_slot *aslot;
>>>>>>>> +	struct kvm_memory_slot *slot;
>>>>>>>> +	gfn_t rel_gfn, pin_pfn;
>>>>>>>> +	unsigned long npages;
>>>>>>>> +	kvm_pfn_t old_pfn;
>>>>>>>> +	int i;
>>>>>>>> +
>>>>>>>> +	if (!sev_guest(kvm))
>>>>>>>> +		return;
>>>>>>>> +
>>>>>>>> +	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
>>>>>>>> +		return;
>>>>>>>> +
>>>>>>>> +	/* Tested till 1GB pages */
>>>>>>>> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
>>>>>>>> +		return;
>>>>>>>> +
>>>>>>>> +	slot = gfn_to_memslot(kvm, gfn);
>>>>>>>> +	if (!slot || !slot->arch.pfns)
>>>>>>>> +		return;
>>>>>>>> +
>>>>>>>> +	/*
>>>>>>>> +	 * Use relative gfn index within the memslot for the bitmap as well as
>>>>>>>> +	 * the pfns array
>>>>>>>> +	 */
>>>>>>>> +	rel_gfn = gfn - slot->base_gfn;
>>>>>>>> +	aslot = &slot->arch;
>>>>>>>> +	pin_pfn = pfn;
>>>>>>>> +	npages = KVM_PAGES_PER_HPAGE(level);
>>>>>>>> +
>>>>>>>> +	/* Pin the page, KVM doesn't yet support page migration. */
>>>>>>>> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
>>>>>>>> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
>>>>>>>> +			old_pfn = aslot->pfns[rel_gfn];
>>>>>>>> +			if (old_pfn == pin_pfn)
>>>>>>>> +				continue;
>>>>>>>> +
>>>>>>>> +			put_page(pfn_to_page(old_pfn));
>>>>>>>> +		}
>>>>>>>> +
>>>>>>>> +		set_bit(rel_gfn, aslot->pinned_bitmap);
>>>>>>>> +		aslot->pfns[rel_gfn] = pin_pfn;
>>>>>>>> +		get_page(pfn_to_page(pin_pfn));
>>>>>>>
>>>>>>>
>>>>>>> I assume this is to replace KVM_MEMORY_ENCRYPT_REG_REGION, which ends up
>>>>>>> calling svm_register_enc_region()->sev_pin_memory(), correct?
>>>>>>
>>>>>> Yes, that is correct.
>>>>>>>
>>>>>>> sev_pin_memory() correctly checks the RLIMIT_MEMLOCK and uses
>>>>>>> pin_user_pages_fast().
>>>>>>>
>>>>>>> I have to strongly assume that sev_pin_memory() is *wrong* as is because
>>>>>>> it's supposed to supply FOLL_LONGTERM -- after all we're pinning these
>>>>>>> pages possibly forever.
>>>>>>>
>>>>>>>
>>>>>>> I might be wrong but
>>>>>>>
>>>>>>> 1. You are missing the RLIMIT_MEMLOCK check
>>>>>>
>>>>>> Yes, I will add this check during the enc_region registration.
>>>>>>
>>>>>>> 2. get_page() is the wong way of long-term pinning a page. You would
>>>>>>> have to mimic what pin_user_pages_fast(FOLL_LONGTERM) does to eventually
>>>>>>> get it right (e.g., migrate the page off of MIGRATE_CMA or ZONE_MOVABLE).
>>>>>>
>>>>>> Let me go through this and I will come back. Thanks for pointing this out.
>>>>>
>>>>> I asusme the "issue" is that KVM uses mmu notifier and does a simple
>>>>> get_user_pages() to obtain the references, to drop the reference when
>>>>> the entry is invalidated via a mmu notifier call. So once you intent to
>>>>> long-term pin, it's already to late.
>>>>>
>>>>> If you could teach KVM to do a long-term pin when stumbling over these
>>>>> special encrypted memory regions (requires a proper matching
>>>>> unpin_user_pages() call from KVM), then you could "take over" that pin
>>>>> by get_page(), and let KVM do the ordinary put_page(), while you would
>>>>> do the unpin_user_pages().
>>>>>
>>>>
>>>> The fault path looks like this in KVM x86 mmu code:
>>>>
>>>> direct_page_fault()
>>>> -> kvm_faultin_pfn()
>>>>    -> __gfn_to_pfn_memslot()
>>>>       -> hva_to_pfn()
>>>>          -> hva_to_pfn_{slow,fast}()
>>>>             -> get_user_pages_*()      <<<<==== This is where the
>>>>                                                 reference is taken
>>>>
>>>> Next step is to create the mappings which is done in below functions:
>>>>
>>>> -> kvm_tdp_mmu_map() / __direct_map()
>>>>
>>>>    -> Within this function (patch 1/6), I call sev_pin_spte to take an extra 
>>>>       reference to pin it using get_page. 
>>>>
>>>>       Is it possible to use pin_user_pages(FOLL_LONGTERM) here? Wouldn't that 
>>>>       be equivalent to "take over" solution that you are suggesting?
>>>>
>>>
>>> The issue is that pin_user_pages(FOLL_LONGTERM) might have to migrate
>>> the page, which will fail if there is already an additional reference
>>> from get_user_pages_*().
>>>
>>
>> Minor addition: hva_to_pfn_{slow,fast}() *don't* take a reference,
> 
> hva_to_pfn_fast() does take a reference, not able to find in _slow() though.

Ah, my fault, you're correct and my memory is wrong.

> 
> ->get_user_page_fast_only()
>   -> get_user_pages_fast_only()
>      ...
>      gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
>      ...

__get_user_pages_locked() has

if (pages && !(flags & FOLL_PIN))
	flags |= FOLL_GET;


I could have sworn we'd have code to look up a page without the need to
grab a reference for MMU notifier purposes in KVM's MMU.

But looking into the details, I think we simply get a reference, map the
page, and then release the reference.
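(A condensed, purely illustrative version of that get-reference/map/release
pattern; map_gfn_sketch() and install_spte_sketch() are invented names, while
get_user_page_fast_only() and kvm_release_pfn_clean() are the real helpers
involved. This is a sketch of the flow discussed above, not the actual KVM
fault path.)

static int map_gfn_sketch(struct kvm_vcpu *vcpu, gfn_t gfn, unsigned long hva)
{
	struct page *page;
	kvm_pfn_t pfn;
	int r;

	/* FOLL_GET reference, as hva_to_pfn_fast() takes. */
	if (!get_user_page_fast_only(hva, FOLL_WRITE, &page))
		return -EFAULT;			/* slow path elided */

	pfn = page_to_pfn(page);
	r = install_spte_sketch(vcpu, gfn, pfn);	/* hypothetical mapping step */

	/*
	 * The fault path drops its reference once the SPTE is installed; the
	 * long-lived reference is the separate get_page() taken by
	 * sev_pin_spte() in this series.
	 */
	kvm_release_pfn_clean(pfn);
	return r;
}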
Mingwei Zhang March 6, 2022, 7:48 p.m. UTC | #13
On Tue, Jan 18, 2022, Nikunj A Dadhania wrote:
> Use the memslot metadata to store the pinned data along with the pfns.
> This improves the SEV guest startup time from O(n) to a constant by
> deferring guest page pinning until the pages are used to satisfy nested
> page faults. The page reference will be dropped in the memslot free
> path.
> 
> Remove the enc_region structure definition and the code which did
> upfront pinning, as they are no longer needed in view of the demand
> pinning support.
> 
> Leave svm_register_enc_region() and svm_unregister_enc_region() as stubs
> since qemu is dependent on this API.
> 
> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
> ---
>  arch/x86/kvm/svm/sev.c | 208 ++++++++++++++++-------------------------
>  arch/x86/kvm/svm/svm.c |   1 +
>  arch/x86/kvm/svm/svm.h |   3 +-
>  3 files changed, 81 insertions(+), 131 deletions(-)
> 
> [...]
> @@ -2946,13 +2817,90 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>  	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>  }
>  
> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +		  kvm_pfn_t pfn)
> +{
> +	struct kvm_arch_memory_slot *aslot;
> +	struct kvm_memory_slot *slot;
> +	gfn_t rel_gfn, pin_pfn;
> +	unsigned long npages;
> +	kvm_pfn_t old_pfn;
> +	int i;
> +
> +	if (!sev_guest(kvm))
> +		return;
> +
> +	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
> +		return;
> +
> +	/* Tested till 1GB pages */
> +	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
> +		return;
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (!slot || !slot->arch.pfns)
> +		return;
> +
> +	/*
> +	 * Use relative gfn index within the memslot for the bitmap as well as
> +	 * the pfns array
> +	 */
> +	rel_gfn = gfn - slot->base_gfn;
> +	aslot = &slot->arch;
> +	pin_pfn = pfn;
> +	npages = KVM_PAGES_PER_HPAGE(level);
> +
> +	/* Pin the page, KVM doesn't yet support page migration. */
> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
> +			old_pfn = aslot->pfns[rel_gfn];
> +			if (old_pfn == pin_pfn)
> +				continue;
> +
> +			put_page(pfn_to_page(old_pfn));

You need to flush the old pfn using VMPAGE_FLUSH before calling put_page().
Normally this should not happen. But if the user-level VMM is
malicious, it could simply munmap() the region (not the memslot),
mmap() it again, and let the guest VM touch the page; you will then
see this path get executed.

Clearly, this will slow down the faulting path when it happens.  So,
alternatively, you can register a hook in mmu_notifier and issue the
flush there according to the bitmap. Either way should work.
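
To make the remap scenario concrete, here is a minimal userspace
sketch (hypothetical address and size handling, not part of this
series) of how a VMM could swap the backing page of an already-pinned
gfn without ever touching the memslot:

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

/*
 * Hypothetical illustration only: guest_hva is assumed to be the host
 * virtual address that was handed to KVM as the memslot's
 * userspace_addr.  Remapping it does not change the memslot, but the
 * guest's next access to the range faults in a brand-new pfn.
 */
static void remap_guest_ram(void *guest_hva, size_t size)
{
	/* Drop the old mapping; the old pfn stays pinned by KVM. */
	munmap(guest_hva, size);

	/* Recreate an anonymous mapping at the same host virtual address. */
	if (mmap(guest_hva, size, PROT_READ | PROT_WRITE,
		 MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0) == MAP_FAILED)
		perror("mmap");

	/*
	 * When the guest touches this range again, sev_pin_spte() sees
	 * old_pfn != pin_pfn and drops the old page, which is why the
	 * stale guest-tagged cache lines must be flushed first.
	 */
}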

> +		}
> +
> +		set_bit(rel_gfn, aslot->pinned_bitmap);
> +		aslot->pfns[rel_gfn] = pin_pfn;
> +		get_page(pfn_to_page(pin_pfn));
> +	}
> +
> +	/*
> +	 * Flush any cached lines of the page being added since "ownership" of
> +	 * it will be transferred from the host to an encrypted guest.
> +	 */
> +	clflush_cache_range(__va(pfn << PAGE_SHIFT), page_level_size(level));
> +}
> +
>  void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
>  {
>  	struct kvm_arch_memory_slot *aslot = &slot->arch;
> +	kvm_pfn_t *pfns;
> +	gfn_t gfn;
> +	int i;
>  
>  	if (!sev_guest(kvm))
>  		return;
>  
> +	if (!aslot->pinned_bitmap || !slot->arch.pfns)
> +		goto out;
> +
> +	pfns = aslot->pfns;
> +
> +	/*
> +	 * Iterate the memslot to find the pinned pfn using the bitmap and drop
> +	 * the pfn stored.
> +	 */
> +	for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) {
> +		if (test_and_clear_bit(i, aslot->pinned_bitmap)) {
> +			if (WARN_ON(!pfns[i]))
> +				continue;
> +
> +			put_page(pfn_to_page(pfns[i]));

Here, you get lucky that you don't have to flush the cache. However,
this is only because sev_free_memslot() is called after
kvm_arch_destroy_vm(), which flushes the cache system-wide.
> +		}
> +	}
> +
> +out:
>  	if (aslot->pinned_bitmap) {
>  		kvfree(aslot->pinned_bitmap);
>  		aslot->pinned_bitmap = NULL;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 3fb19974f719..22535c680b3f 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4743,6 +4743,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>  
>  	.alloc_memslot_metadata = sev_alloc_memslot_metadata,
>  	.free_memslot = sev_free_memslot,
> +	.pin_spte = sev_pin_spte,
>  };
>  
>  static struct kvm_x86_init_ops svm_init_ops __initdata = {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index b2f8b3b52680..c731bc91ea8f 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -77,7 +77,6 @@ struct kvm_sev_info {
>  	unsigned int handle;	/* SEV firmware handle */
>  	int fd;			/* SEV device fd */
>  	unsigned long pages_locked; /* Number of pages locked */
> -	struct list_head regions_list;  /* List of registered regions */
>  	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
>  	struct kvm *enc_context_owner; /* Owner of copied encryption context */
>  	unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
> @@ -648,5 +647,7 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
>  			       struct kvm_memory_slot *new);
>  void sev_free_memslot(struct kvm *kvm,
>  		      struct kvm_memory_slot *slot);
> +void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> +		  kvm_pfn_t pfn);
>  
>  #endif
> -- 
> 2.32.0
>
Nikunj A. Dadhania March 7, 2022, 7:08 a.m. UTC | #14
On 3/7/2022 1:18 AM, Mingwei Zhang wrote:
> On Tue, Jan 18, 2022, Nikunj A Dadhania wrote:
>> Use the memslot metadata to store the pinned data along with the pfns.
>> This improves the SEV guest startup time from O(n) to a constant by
>> deferring guest page pinning until the pages are used to satisfy nested
>> page faults. The page reference will be dropped in the memslot free
>> path.
>>
>> Remove the enc_region structure definition and the code which did
>> upfront pinning, as they are no longer needed in view of the demand
>> pinning support.
>>
>> Leave svm_register_enc_region() and svm_unregister_enc_region() as stubs
>> since qemu is dependent on this API.
>>
>> Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
>> ---

>> +
>> +	/* Pin the page, KVM doesn't yet support page migration. */
>> +	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
>> +		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
>> +			old_pfn = aslot->pfns[rel_gfn];
>> +			if (old_pfn == pin_pfn)
>> +				continue;
>> +
>> +			put_page(pfn_to_page(old_pfn));
> 
> You need to flush the old pfn using VMPAGE_FLUSH before calling put_page().
> Normally this should not happen. But if the user-level VMM is
> malicious, it could simply munmap() the region (not the memslot),
> mmap() it again, and let the guest VM touch the page; you will then
> see this path get executed.
> 
> Clearly, this will slow down the faulting path when it happens.  So,
> alternatively, you can register a hook in mmu_notifier and issue the
> flush there according to the bitmap. Either way should work.
>

We can call sev_flush_guest_memory() before the put_page().
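
As a rough sketch of what that looks like in the faulting-path loop
(sev_flush_old_pfn() below is a hypothetical stand-in for whichever
flush primitive the next revision ends up using, e.g.
sev_flush_guest_memory() or a plain WBINVD; it is not part of this
patch):

/*
 * Hypothetical helper: flush guest-tagged (C-bit) cache lines of a pfn
 * that is about to be unpinned.  A full WBINVD is the conservative
 * choice; a targeted VM_PAGE_FLUSH based flush of just this pfn would
 * be cheaper and would actually use the kvm/pfn/level arguments.
 */
static void sev_flush_old_pfn(struct kvm *kvm, kvm_pfn_t pfn,
			      enum pg_level level)
{
	wbinvd_on_all_cpus();
}

	/* In sev_pin_spte(): */
	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
			old_pfn = aslot->pfns[rel_gfn];
			if (old_pfn == pin_pfn)
				continue;

			/* Flush before the last reference is dropped. */
			sev_flush_old_pfn(kvm, old_pfn, level);
			put_page(pfn_to_page(old_pfn));
		}

		set_bit(rel_gfn, aslot->pinned_bitmap);
		aslot->pfns[rel_gfn] = pin_pfn;
		get_page(pfn_to_page(pin_pfn));
	}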

>> +		}
>> +
>> +		set_bit(rel_gfn, aslot->pinned_bitmap);
>> +		aslot->pfns[rel_gfn] = pin_pfn;
>> +		get_page(pfn_to_page(pin_pfn));
>> +	}
>> +
>> +	/*
>> +	 * Flush any cached lines of the page being added since "ownership" of
>> +	 * it will be transferred from the host to an encrypted guest.
>> +	 */
>> +	clflush_cache_range(__va(pfn << PAGE_SHIFT), page_level_size(level));
>> +}
>> +
>>  void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
>>  {
>>  	struct kvm_arch_memory_slot *aslot = &slot->arch;
>> +	kvm_pfn_t *pfns;
>> +	gfn_t gfn;
>> +	int i;
>>  
>>  	if (!sev_guest(kvm))
>>  		return;
>>  
>> +	if (!aslot->pinned_bitmap || !slot->arch.pfns)
>> +		goto out;
>> +
>> +	pfns = aslot->pfns;
>> +
>> +	/*
>> +	 * Iterate the memslot to find the pinned pfn using the bitmap and drop
>> +	 * the pfn stored.
>> +	 */
>> +	for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) {
>> +		if (test_and_clear_bit(i, aslot->pinned_bitmap)) {
>> +			if (WARN_ON(!pfns[i]))
>> +				continue;
>> +
>> +			put_page(pfn_to_page(pfns[i]));
> 
> Here, you get lucky that you don't have to flush the cache. However,
> this is only because sev_free_memslot() is called after
> kvm_arch_destroy_vm(), which flushes the cache system-wide.

I have added wbinvd_on_all_cpus() just before the iteration in my new version.
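
For reference, a minimal sketch of how the teardown loop in
sev_free_memslot() reads with that change (everything other than the
wbinvd_on_all_cpus() call is unchanged from this patch):

	/*
	 * Flush guest-tagged cache lines before any pinned page is
	 * released, instead of relying on the system-wide WBINVD issued
	 * earlier in sev_vm_destroy().
	 */
	wbinvd_on_all_cpus();

	/*
	 * Iterate the memslot to find the pinned pfn using the bitmap and
	 * drop the reference taken in sev_pin_spte().
	 */
	for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) {
		if (test_and_clear_bit(i, aslot->pinned_bitmap)) {
			if (WARN_ON(!pfns[i]))
				continue;

			put_page(pfn_to_page(pfns[i]));
		}
	}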

Regards
Nikunj
diff mbox series

Patch

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index d972ab4956d4..a962bed97a0b 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -66,14 +66,6 @@  static unsigned int nr_asids;
 static unsigned long *sev_asid_bitmap;
 static unsigned long *sev_reclaim_asid_bitmap;
 
-struct enc_region {
-	struct list_head list;
-	unsigned long npages;
-	struct page **pages;
-	unsigned long uaddr;
-	unsigned long size;
-};
-
 /* Called with the sev_bitmap_lock held, or on shutdown  */
 static int sev_flush_asids(int min_asid, int max_asid)
 {
@@ -257,8 +249,6 @@  static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	if (ret)
 		goto e_free;
 
-	INIT_LIST_HEAD(&sev->regions_list);
-
 	return 0;
 
 e_free:
@@ -1637,8 +1627,6 @@  static void sev_migrate_from(struct kvm_sev_info *dst,
 	src->handle = 0;
 	src->pages_locked = 0;
 	src->enc_context_owner = NULL;
-
-	list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
 }
 
 static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1861,115 +1849,13 @@  int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 int svm_register_enc_region(struct kvm *kvm,
 			    struct kvm_enc_region *range)
 {
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct enc_region *region;
-	int ret = 0;
-
-	if (!sev_guest(kvm))
-		return -ENOTTY;
-
-	/* If kvm is mirroring encryption context it isn't responsible for it */
-	if (is_mirroring_enc_context(kvm))
-		return -EINVAL;
-
-	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
-		return -EINVAL;
-
-	region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
-	if (!region)
-		return -ENOMEM;
-
-	mutex_lock(&kvm->lock);
-	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
-	if (IS_ERR(region->pages)) {
-		ret = PTR_ERR(region->pages);
-		mutex_unlock(&kvm->lock);
-		goto e_free;
-	}
-
-	region->uaddr = range->addr;
-	region->size = range->size;
-
-	list_add_tail(&region->list, &sev->regions_list);
-	mutex_unlock(&kvm->lock);
-
-	/*
-	 * The guest may change the memory encryption attribute from C=0 -> C=1
-	 * or vice versa for this memory range. Lets make sure caches are
-	 * flushed to ensure that guest data gets written into memory with
-	 * correct C-bit.
-	 */
-	sev_clflush_pages(region->pages, region->npages);
-
-	return ret;
-
-e_free:
-	kfree(region);
-	return ret;
-}
-
-static struct enc_region *
-find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
-{
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct list_head *head = &sev->regions_list;
-	struct enc_region *i;
-
-	list_for_each_entry(i, head, list) {
-		if (i->uaddr == range->addr &&
-		    i->size == range->size)
-			return i;
-	}
-
-	return NULL;
-}
-
-static void __unregister_enc_region_locked(struct kvm *kvm,
-					   struct enc_region *region)
-{
-	sev_unpin_memory(kvm, region->pages, region->npages);
-	list_del(&region->list);
-	kfree(region);
+	return 0;
 }
 
 int svm_unregister_enc_region(struct kvm *kvm,
 			      struct kvm_enc_region *range)
 {
-	struct enc_region *region;
-	int ret;
-
-	/* If kvm is mirroring encryption context it isn't responsible for it */
-	if (is_mirroring_enc_context(kvm))
-		return -EINVAL;
-
-	mutex_lock(&kvm->lock);
-
-	if (!sev_guest(kvm)) {
-		ret = -ENOTTY;
-		goto failed;
-	}
-
-	region = find_enc_region(kvm, range);
-	if (!region) {
-		ret = -EINVAL;
-		goto failed;
-	}
-
-	/*
-	 * Ensure that all guest tagged cache entries are flushed before
-	 * releasing the pages back to the system for use. CLFLUSH will
-	 * not do this, so issue a WBINVD.
-	 */
-	wbinvd_on_all_cpus();
-
-	__unregister_enc_region_locked(kvm, region);
-
-	mutex_unlock(&kvm->lock);
 	return 0;
-
-failed:
-	mutex_unlock(&kvm->lock);
-	return ret;
 }
 
 int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
@@ -2018,7 +1904,6 @@  int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
 	mirror_sev->fd = source_sev->fd;
 	mirror_sev->es_active = source_sev->es_active;
 	mirror_sev->handle = source_sev->handle;
-	INIT_LIST_HEAD(&mirror_sev->regions_list);
 	ret = 0;
 
 	/*
@@ -2038,8 +1923,6 @@  int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
 void sev_vm_destroy(struct kvm *kvm)
 {
 	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct list_head *head = &sev->regions_list;
-	struct list_head *pos, *q;
 
 	WARN_ON(sev->num_mirrored_vms);
 
@@ -2066,18 +1949,6 @@  void sev_vm_destroy(struct kvm *kvm)
 	 */
 	wbinvd_on_all_cpus();
 
-	/*
-	 * if userspace was terminated before unregistering the memory regions
-	 * then lets unpin all the registered memory.
-	 */
-	if (!list_empty(head)) {
-		list_for_each_safe(pos, q, head) {
-			__unregister_enc_region_locked(kvm,
-				list_entry(pos, struct enc_region, list));
-			cond_resched();
-		}
-	}
-
 	sev_unbind_asid(kvm, sev->handle);
 	sev_asid_free(sev);
 }
@@ -2946,13 +2817,90 @@  void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
 }
 
+void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+		  kvm_pfn_t pfn)
+{
+	struct kvm_arch_memory_slot *aslot;
+	struct kvm_memory_slot *slot;
+	gfn_t rel_gfn, pin_pfn;
+	unsigned long npages;
+	kvm_pfn_t old_pfn;
+	int i;
+
+	if (!sev_guest(kvm))
+		return;
+
+	if (WARN_ON_ONCE(is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)))
+		return;
+
+	/* Tested till 1GB pages */
+	if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
+		return;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	if (!slot || !slot->arch.pfns)
+		return;
+
+	/*
+	 * Use relative gfn index within the memslot for the bitmap as well as
+	 * the pfns array
+	 */
+	rel_gfn = gfn - slot->base_gfn;
+	aslot = &slot->arch;
+	pin_pfn = pfn;
+	npages = KVM_PAGES_PER_HPAGE(level);
+
+	/* Pin the page, KVM doesn't yet support page migration. */
+	for (i = 0; i < npages; i++, rel_gfn++, pin_pfn++) {
+		if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
+			old_pfn = aslot->pfns[rel_gfn];
+			if (old_pfn == pin_pfn)
+				continue;
+
+			put_page(pfn_to_page(old_pfn));
+		}
+
+		set_bit(rel_gfn, aslot->pinned_bitmap);
+		aslot->pfns[rel_gfn] = pin_pfn;
+		get_page(pfn_to_page(pin_pfn));
+	}
+
+	/*
+	 * Flush any cached lines of the page being added since "ownership" of
+	 * it will be transferred from the host to an encrypted guest.
+	 */
+	clflush_cache_range(__va(pfn << PAGE_SHIFT), page_level_size(level));
+}
+
 void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
 	struct kvm_arch_memory_slot *aslot = &slot->arch;
+	kvm_pfn_t *pfns;
+	gfn_t gfn;
+	int i;
 
 	if (!sev_guest(kvm))
 		return;
 
+	if (!aslot->pinned_bitmap || !slot->arch.pfns)
+		goto out;
+
+	pfns = aslot->pfns;
+
+	/*
+	 * Iterate the memslot to find the pinned pfn using the bitmap and drop
+	 * the pfn stored.
+	 */
+	for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) {
+		if (test_and_clear_bit(i, aslot->pinned_bitmap)) {
+			if (WARN_ON(!pfns[i]))
+				continue;
+
+			put_page(pfn_to_page(pfns[i]));
+		}
+	}
+
+out:
 	if (aslot->pinned_bitmap) {
 		kvfree(aslot->pinned_bitmap);
 		aslot->pinned_bitmap = NULL;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3fb19974f719..22535c680b3f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4743,6 +4743,7 @@  static struct kvm_x86_ops svm_x86_ops __initdata = {
 
 	.alloc_memslot_metadata = sev_alloc_memslot_metadata,
 	.free_memslot = sev_free_memslot,
+	.pin_spte = sev_pin_spte,
 };
 
 static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index b2f8b3b52680..c731bc91ea8f 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -77,7 +77,6 @@  struct kvm_sev_info {
 	unsigned int handle;	/* SEV firmware handle */
 	int fd;			/* SEV device fd */
 	unsigned long pages_locked; /* Number of pages locked */
-	struct list_head regions_list;  /* List of registered regions */
 	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
 	struct kvm *enc_context_owner; /* Owner of copied encryption context */
 	unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
@@ -648,5 +647,7 @@  int sev_alloc_memslot_metadata(struct kvm *kvm,
 			       struct kvm_memory_slot *new);
 void sev_free_memslot(struct kvm *kvm,
 		      struct kvm_memory_slot *slot);
+void sev_pin_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+		  kvm_pfn_t pfn);
 
 #endif