Message ID | 20240227232100.478238-19-pbonzini@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Series | TDX/SNP part 1 of n, for 6.9 |
On Tue, Feb 27, 2024 at 06:20:57PM -0500, Paolo Bonzini <pbonzini@redhat.com> wrote:
> guest_memfd pages are generally expected to be in some arch-defined
> initial state prior to using them for guest memory. For SEV-SNP this
> initial state is 'private', or 'guest-owned', and requires additional
> operations to move these pages into a 'private' state by updating the
> corresponding entries the RMP table.
>
> Allow for an arch-defined hook to handle updates of this sort, and go
> ahead and implement one for x86 so KVM implementations like AMD SVM can
> register a kvm_x86_ops callback to handle these updates for SEV-SNP
> guests.
>
> The preparation callback is always called when allocating/grabbing
> folios via gmem, and it is up to the architecture to keep track of
> whether or not the pages are already in the expected state (e.g. the RMP
> table in the case of SEV-SNP).
>
> In some cases, it is necessary to defer the preparation of the pages to
> handle things like in-place encryption of initial guest memory payloads
> before marking these pages as 'private'/'guest-owned', so also add a
> helper that performs the same function as kvm_gmem_get_pfn(), but allows
> for the preparation callback to be bypassed to allow for pages to be
> accessed beforehand.
>
> Link: https://lore.kernel.org/lkml/ZLqVdvsF11Ddo7Dq@google.com/
> Co-developed-by: Michael Roth <michael.roth@amd.com>
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> Message-Id: <20231230172351.574091-5-michael.roth@amd.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  1 +
>  arch/x86/include/asm/kvm_host.h    |  1 +
>  arch/x86/kvm/x86.c                 |  6 +++
>  include/linux/kvm_host.h           | 14 ++++++
>  virt/kvm/Kconfig                   |  4 ++
>  virt/kvm/guest_memfd.c             | 72 +++++++++++++++++++++++++++---
>  6 files changed, 92 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index ac8b7614e79d..adfaad15e7e6 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -139,6 +139,7 @@ KVM_X86_OP(complete_emulated_msr)
>  KVM_X86_OP(vcpu_deliver_sipi_vector)
>  KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
>  KVM_X86_OP_OPTIONAL(get_untagged_addr)
> +KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
>
>  #undef KVM_X86_OP
>  #undef KVM_X86_OP_OPTIONAL
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7de8a3f2a118..6d873d08f739 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1804,6 +1804,7 @@ struct kvm_x86_ops {
>          unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
>
>          gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
> +        int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
>  };
>
>  struct kvm_x86_nested_ops {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index f10a5a617120..eff532ea59c9 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -13598,6 +13598,12 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
>
> +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
> +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
> +{
> +        return static_call(kvm_x86_gmem_prepare)(kvm, pfn, gfn, max_order);
> +}
> +#endif
>
>  int kvm_spec_ctrl_test_value(u64 value)
>  {
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 97afe4519772..03bf616b7308 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2434,6 +2434,8 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>  #ifdef CONFIG_KVM_PRIVATE_MEM
>  int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>                       gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
> +int kvm_gmem_get_uninit_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> +                            gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
>  #else
>  static inline int kvm_gmem_get_pfn(struct kvm *kvm,
>                                     struct kvm_memory_slot *slot, gfn_t gfn,
> @@ -2442,6 +2444,18 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
>          KVM_BUG_ON(1, kvm);
>          return -EIO;
>  }
> +
> +static inline int kvm_gmem_get_uninit_pfn(struct kvm *kvm,
> +                                          struct kvm_memory_slot *slot, gfn_t gfn,
> +                                          kvm_pfn_t *pfn, int *max_order)
> +{
> +        KVM_BUG_ON(1, kvm);
> +        return -EIO;
> +}
>  #endif /* CONFIG_KVM_PRIVATE_MEM */
>
> +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
> +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
> +#endif
> +
>  #endif
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index a11e9c80fac9..dcce0c3b5b13 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -111,3 +111,7 @@ config KVM_GENERIC_PRIVATE_MEM
>          select KVM_GENERIC_MEMORY_ATTRIBUTES
>          select KVM_PRIVATE_MEM
>          bool
> +
> +config HAVE_KVM_GMEM_PREPARE
> +        bool
> +        depends on KVM_PRIVATE_MEM
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index de0d5a5c210c..7ec7afafc960 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -13,12 +13,50 @@ struct kvm_gmem {
>          struct list_head entry;
>  };
>
> -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> +static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
> +{
> +#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
> +        struct list_head *gmem_list = &inode->i_mapping->i_private_list;
> +        struct kvm_gmem *gmem;
> +
> +        list_for_each_entry(gmem, gmem_list, entry) {
> +                struct kvm_memory_slot *slot;
> +                struct kvm *kvm = gmem->kvm;
> +                struct page *page;
> +                kvm_pfn_t pfn;
> +                gfn_t gfn;
> +                int rc;
> +
> +                slot = xa_load(&gmem->bindings, index);
> +                if (!slot)
> +                        continue;
> +
> +                page = folio_file_page(folio, index);
> +                pfn = page_to_pfn(page);
> +                gfn = slot->base_gfn + index - slot->gmem.pgoff;
> +                rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page)));
> +                if (rc) {
> +                        pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx, error %d.\n",
> +                                            index, rc);
> +                        return rc;
> +                }
> +        }
> +
> +#endif
> +        return 0;
> +}

Can we make this conditional? TDX doesn't need the prepare hook and would
set gmem_prepare = NULL, but with a large-memory guest (several hundred GB)
this page-cache lookup loop still slows down guest startup. I think the
same applies to SW_PROTECTED_VM (and to pKVM in the future).
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3835732491b9..cafb8d0997b5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -842,6 +842,9 @@ struct kvm {
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
         /* Protected by slots_locks (for writes) and RCU (for reads) */
         struct xarray mem_attr_array;
+#endif
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+        bool gmem_need_prepare;
 #endif
         char stats_id[KVM_STATS_NAME_SIZE];
 };
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 74e19170af8a..ab7d0f7d3d38 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -16,6 +16,7 @@ struct kvm_gmem {
 static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
 {
 #ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+        rc = kvm_arch_gmem_prepare(inode, index, folio);
         struct list_head *gmem_list = &inode->i_mapping->i_private_list;
         struct kvm_gmem *gmem;
 
@@ -27,6 +28,9 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol
                 gfn_t gfn;
                 int rc;
 
+                if (!kvm->gmem_need_prepare)
+                        continue;
+
                 slot = xa_load(&gmem->bindings, index);
                 if (!slot)
                         continue;
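If a flag along these lines were adopted, only vendor code that actually needs the hook would opt in at VM-creation time, and TDX-style or SW_PROTECTED_VM guests would avoid the per-folio work. Below is a minimal sketch of how an implementation might set the proposed flag; only kvm->gmem_need_prepare comes from the suggestion above, while the function name and call site are hypothetical:

```c
/*
 * Hypothetical sketch: opt in to the proposed kvm->gmem_need_prepare flag
 * from vendor VM-init code. Only the flag itself comes from the suggested
 * diff above; this function and where it would be called from are
 * illustrative assumptions, not part of the patch or the suggestion.
 */
static int example_sev_snp_vm_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
	/*
	 * SEV-SNP must flip gmem pages to guest-owned in the RMP table,
	 * so it wants kvm_gmem_prepare_folio() to do real work.
	 */
	kvm->gmem_need_prepare = true;
#endif
	return 0;
}
```

A TDX or SW_PROTECTED_VM init path would simply leave kvm->gmem_need_prepare as false, so the xa_load() binding lookup is skipped for every folio during allocation. For reference, the patch under review is reproduced in full below.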
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index ac8b7614e79d..adfaad15e7e6 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -139,6 +139,7 @@ KVM_X86_OP(complete_emulated_msr)
 KVM_X86_OP(vcpu_deliver_sipi_vector)
 KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
 KVM_X86_OP_OPTIONAL(get_untagged_addr)
+KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
 
 #undef KVM_X86_OP
 #undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7de8a3f2a118..6d873d08f739 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1804,6 +1804,7 @@ struct kvm_x86_ops {
         unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
 
         gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
+        int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f10a5a617120..eff532ea59c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13598,6 +13598,12 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
 
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
+{
+        return static_call(kvm_x86_gmem_prepare)(kvm, pfn, gfn, max_order);
+}
+#endif
 
 int kvm_spec_ctrl_test_value(u64 value)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 97afe4519772..03bf616b7308 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2434,6 +2434,8 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
 #ifdef CONFIG_KVM_PRIVATE_MEM
 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                      gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
+int kvm_gmem_get_uninit_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+                            gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
 #else
 static inline int kvm_gmem_get_pfn(struct kvm *kvm,
                                    struct kvm_memory_slot *slot, gfn_t gfn,
@@ -2442,6 +2444,18 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
         KVM_BUG_ON(1, kvm);
         return -EIO;
 }
+
+static inline int kvm_gmem_get_uninit_pfn(struct kvm *kvm,
+                                          struct kvm_memory_slot *slot, gfn_t gfn,
+                                          kvm_pfn_t *pfn, int *max_order)
+{
+        KVM_BUG_ON(1, kvm);
+        return -EIO;
+}
 #endif /* CONFIG_KVM_PRIVATE_MEM */
 
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
+#endif
+
 #endif
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index a11e9c80fac9..dcce0c3b5b13 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -111,3 +111,7 @@ config KVM_GENERIC_PRIVATE_MEM
         select KVM_GENERIC_MEMORY_ATTRIBUTES
         select KVM_PRIVATE_MEM
         bool
+
+config HAVE_KVM_GMEM_PREPARE
+        bool
+        depends on KVM_PRIVATE_MEM
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index de0d5a5c210c..7ec7afafc960 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -13,12 +13,50 @@ struct kvm_gmem {
         struct list_head entry;
 };
 
-static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct folio *folio)
+{
+#ifdef CONFIG_HAVE_KVM_GMEM_PREPARE
+        struct list_head *gmem_list = &inode->i_mapping->i_private_list;
+        struct kvm_gmem *gmem;
+
+        list_for_each_entry(gmem, gmem_list, entry) {
+                struct kvm_memory_slot *slot;
+                struct kvm *kvm = gmem->kvm;
+                struct page *page;
+                kvm_pfn_t pfn;
+                gfn_t gfn;
+                int rc;
+
+                slot = xa_load(&gmem->bindings, index);
+                if (!slot)
+                        continue;
+
+                page = folio_file_page(folio, index);
+                pfn = page_to_pfn(page);
+                gfn = slot->base_gfn + index - slot->gmem.pgoff;
+                rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, compound_order(compound_head(page)));
+                if (rc) {
+                        pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx, error %d.\n",
+                                            index, rc);
+                        return rc;
+                }
+        }
+
+#endif
+        return 0;
+}
+
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
 {
         struct folio *folio;
+        fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+
+        if (!prepare)
+                fgp_flags |= FGP_CREAT_ONLY;
 
         /* TODO: Support huge pages. */
-        folio = filemap_grab_folio(inode->i_mapping, index);
+        folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags,
+                                    mapping_gfp_mask(inode->i_mapping));
         if (IS_ERR_OR_NULL(folio))
                 return folio;
 
@@ -41,6 +79,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
                 folio_mark_uptodate(folio);
         }
 
+        if (prepare) {
+                int r = kvm_gmem_prepare_folio(inode, index, folio);
+                if (r < 0) {
+                        folio_unlock(folio);
+                        folio_put(folio);
+                        return ERR_PTR(r);
+                }
+        }
+
         /*
          * Ignore accessed, referenced, and dirty flags.  The memory is
          * unevictable and there is no storage to write back to.
@@ -145,7 +192,7 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
                         break;
                 }
 
-                folio = kvm_gmem_get_folio(inode, index);
+                folio = kvm_gmem_get_folio(inode, index, true);
                 if (IS_ERR_OR_NULL(folio)) {
                         r = folio ? PTR_ERR(folio) : -ENOMEM;
                         break;
                 }
@@ -482,8 +529,8 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
         fput(file);
 }
 
-int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
-                     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+static int __kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+                              gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare)
 {
         pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
         struct kvm_gmem *gmem;
@@ -503,7 +550,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                 goto out_fput;
         }
 
-        folio = kvm_gmem_get_folio(file_inode(file), index);
+        folio = kvm_gmem_get_folio(file_inode(file), index, prepare);
         if (!folio) {
                 r = -ENOMEM;
                 goto out_fput;
         }
@@ -529,4 +576,17 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 
         return r;
 }
+
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+                     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+        return __kvm_gmem_get_pfn(kvm, slot, gfn, pfn, max_order, true);
+}
 EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
+
+int kvm_gmem_get_uninit_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+                            gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+        return __kvm_gmem_get_pfn(kvm, slot, gfn, pfn, max_order, false);
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_get_uninit_pfn);
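To make the new hook concrete: the commit message has AMD SVM registering a kvm_x86_ops::gmem_prepare callback that moves each page into the guest-owned state in the RMP table. A rough sketch of what such a callback could look like follows; rmp_make_private() stands in for whatever RMP-update helper the SNP host series provides, and nothing in this sketch is part of the patch itself:

```c
/*
 * Sketch of a vendor gmem_prepare implementation (not part of this patch).
 * The signature matches the kvm_x86_ops hook added above; rmp_make_private()
 * and the SNP helpers are placeholders standing in for the real SEV-SNP
 * series.
 */
static int example_snp_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
{
	if (!sev_snp_guest(kvm))
		return 0;	/* nothing to do for non-SNP guests */

	/* Transition the page to guest-owned/private in the RMP table. */
	return rmp_make_private(pfn, gfn_to_gpa(gfn), PG_LEVEL_4K,
				sev_get_asid(kvm), true);
}
```

Wiring this up would then be a one-line addition to svm_x86_ops (.gmem_prepare = example_snp_gmem_prepare). Because the op is declared with KVM_X86_OP_OPTIONAL_RET0, vendors that leave it NULL simply get a 0 return from kvm_arch_gmem_prepare(), which is exactly the TDX case the review comment above is concerned about.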