Message ID | 20210416154106.23721-14-kirill.shutemov@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Series | TDX and guest memory unmapping |
On Fri, Apr 16, 2021, Kirill A. Shutemov wrote: > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 1b404e4d7dd8..f8183386abe7 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -8170,6 +8170,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) > kvm_sched_yield(vcpu->kvm, a0); > ret = 0; > break; > + case KVM_HC_ENABLE_MEM_PROTECTED: > + ret = kvm_protect_memory(vcpu->kvm); > + break; > + case KVM_HC_MEM_SHARE: > + ret = kvm_share_memory(vcpu->kvm, a0, a1); Can you take a look at a proposed hypercall interface for SEV live migration and holler if you (or anyone else) thinks it will have extensibility issues? https://lkml.kernel.org/r/93d7f2c2888315adc48905722574d89699edde33.1618498113.git.ashish.kalra@amd.com > + break; > default: > ret = -KVM_ENOSYS; > break; > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index fadaccb95a4c..cd2374802702 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -436,6 +436,8 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) > } > #endif > > +#define KVM_NR_SHARED_RANGES 32 > + > /* > * Note: > * memslots are not sorted by id anymore, please use id_to_memslot() > @@ -513,6 +515,10 @@ struct kvm { > pid_t userspace_pid; > unsigned int max_halt_poll_ns; > u32 dirty_ring_size; > + bool mem_protected; > + void *id; > + int nr_shared_ranges; > + struct range shared_ranges[KVM_NR_SHARED_RANGES]; Hard no for me. IMO, anything that requires KVM to track shared/pinned pages in a separate tree/array is non-starter. More specific to TDX #MCs, KVM should not be the canonical reference for the state of a page. > }; > > #define kvm_err(fmt, ...) \ ... > @@ -1868,11 +1874,17 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, > flags |= FOLL_WRITE; > if (async) > flags |= FOLL_NOWAIT; > + if (kvm->mem_protected) > + flags |= FOLL_ALLOW_POISONED; This is unsafe, only the flows that are mapping the PFN into the guest should use ALLOW_POISONED, e.g. __kvm_map_gfn() should fail on a poisoned page. > > npages = get_user_pages_unlocked(addr, 1, &page, flags); > if (npages != 1) > return npages; > > + if (IS_ENABLED(CONFIG_HAVE_KVM_PROTECTED_MEMORY) && > + kvm->mem_protected && !kvm_page_allowed(kvm, page)) > + return -EHWPOISON; > + > /* map read fault as writable if possible */ > if (unlikely(!write_fault) && writable) { > struct page *wpage; ... > @@ -2338,19 +2350,93 @@ static int next_segment(unsigned long len, int offset) > return len; > } > > -static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, > - void *data, int offset, int len) > +int copy_from_guest(struct kvm *kvm, void *data, unsigned long hva, int len) > +{ > + int offset = offset_in_page(hva); > + struct page *page; > + int npages, seg; > + void *vaddr; > + > + if (!IS_ENABLED(CONFIG_HAVE_KVM_PROTECTED_MEMORY) || > + !kvm->mem_protected) { > + return __copy_from_user(data, (void __user *)hva, len); > + } > + > + might_fault(); > + kasan_check_write(data, len); > + check_object_size(data, len, false); > + > + while ((seg = next_segment(len, offset)) != 0) { > + npages = get_user_pages_unlocked(hva, 1, &page, > + FOLL_ALLOW_POISONED); > + if (npages != 1) > + return -EFAULT; > + > + if (!kvm_page_allowed(kvm, page)) > + return -EFAULT; > + > + vaddr = kmap_atomic(page); > + memcpy(data, vaddr + offset, seg); > + kunmap_atomic(vaddr); Why is KVM allowed to access a poisoned page? I would expect shared pages to _not_ be poisoned. 
Except for pure software emulation of SEV, KVM can't access guest private memory. > + > + put_page(page); > + len -= seg; > + hva += seg; > + data += seg; > + offset = 0; > + } > + > + return 0; > +} ... > @@ -2693,6 +2775,41 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) > } > EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); > > +int kvm_protect_memory(struct kvm *kvm) > +{ > + if (mmap_write_lock_killable(kvm->mm)) > + return -KVM_EINTR; > + > + kvm->mem_protected = true; > + kvm_arch_flush_shadow_all(kvm); > + mmap_write_unlock(kvm->mm); > + > + return 0; > +} > + > +int kvm_share_memory(struct kvm *kvm, unsigned long gfn, unsigned long npages) > +{ > + unsigned long end = gfn + npages; > + > + if (!npages || !IS_ENABLED(CONFIG_HAVE_KVM_PROTECTED_MEMORY)) > + return 0; > + > + __kvm_share_memory(kvm, gfn, end); > + > + for (; gfn < end; gfn++) { > + struct page *page = gfn_to_page(kvm, gfn); > + unsigned long pfn = page_to_pfn(page); > + > + if (page == KVM_ERR_PTR_BAD_PAGE) > + continue; > + > + kvm_share_pfn(kvm, pfn); I like the idea of using "special" PTE value to denote guest private memory, e.g. in this RFC, HWPOISON. But I strongly dislike having KVM involved in the manipulation of the special flag/value. Today, userspace owns the gfn->hva translations and the kernel effectively owns the hva->pfn translations (with input from userspace). KVM just connects the dots. Having KVM own the shared/private transitions means KVM is now part owner of the entire gfn->hva->pfn translation, i.e. KVM is effectively now a secondary MMU and a co-owner of the primary MMU. This creates locking madness, e.g. KVM taking mmap_sem for write, mmu_lock under page lock, etc..., and also takes control away from userspace. E.g. userspace strategy could be to use a separate backing/pool for shared memory and change the gfn->hva translation (memslots) in reaction to a shared/private conversion. Automatically swizzling things in KVM takes away that option. IMO, KVM should be entirely "passive" in this process, e.g. the guest shares or protects memory, userspace calls into the kernel to change state, and the kernel manages the page tables to prevent bad actors. KVM simply does the plumbing for the guest page tables. > + put_page(page); > + } > + > + return 0; > +} > + > void kvm_sigset_activate(struct kvm_vcpu *vcpu) > { > if (!vcpu->sigset_active) > diff --git a/virt/kvm/mem_protected.c b/virt/kvm/mem_protected.c > new file mode 100644 > index 000000000000..81882bd3232b > --- /dev/null > +++ b/virt/kvm/mem_protected.c > @@ -0,0 +1,110 @@ > +#include <linux/kvm_host.h> > +#include <linux/mm.h> > +#include <linux/rmap.h> > + > +static DEFINE_XARRAY(kvm_pfn_map); > + > +static bool gfn_is_shared(struct kvm *kvm, unsigned long gfn) > +{ > + bool ret = false; > + int i; > + > + spin_lock(&kvm->mmu_lock); Taking mmu_lock for write in the page fault path is a non-starter. The TDP MMU is being converted to support concurrent faults, which is a foundational pillar of its scalability. > + for (i = 0; i < kvm->nr_shared_ranges; i++) { > + if (gfn < kvm->shared_ranges[i].start) > + continue; > + if (gfn >= kvm->shared_ranges[i].end) > + continue; > + > + ret = true; > + break; > + } > + spin_unlock(&kvm->mmu_lock); > + > + return ret; > +}
On 4/17/2021 1:30 AM, Sean Christopherson wrote: > On Fri, Apr 16, 2021, Kirill A. Shutemov wrote: [...] >> index fadaccb95a4c..cd2374802702 100644 >> --- a/include/linux/kvm_host.h >> +++ b/include/linux/kvm_host.h >> @@ -436,6 +436,8 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) >> } >> #endif >> >> +#define KVM_NR_SHARED_RANGES 32 >> + >> /* >> * Note: >> * memslots are not sorted by id anymore, please use id_to_memslot() >> @@ -513,6 +515,10 @@ struct kvm { >> pid_t userspace_pid; >> unsigned int max_halt_poll_ns; >> u32 dirty_ring_size; >> + bool mem_protected; >> + void *id; >> + int nr_shared_ranges; >> + struct range shared_ranges[KVM_NR_SHARED_RANGES]; > > Hard no for me. IMO, anything that requires KVM to track shared/pinned pages in > a separate tree/array is non-starter. More specific to TDX #MCs, KVM should not > be the canonical reference for the state of a page. > Do you mean something in struct page to track if the page is shared or private?
On Fri, Apr 16, 2021 at 05:30:30PM +0000, Sean Christopherson wrote: > On Fri, Apr 16, 2021, Kirill A. Shutemov wrote: > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 1b404e4d7dd8..f8183386abe7 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -8170,6 +8170,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) > > kvm_sched_yield(vcpu->kvm, a0); > > ret = 0; > > break; > > + case KVM_HC_ENABLE_MEM_PROTECTED: > > + ret = kvm_protect_memory(vcpu->kvm); > > + break; > > + case KVM_HC_MEM_SHARE: > > + ret = kvm_share_memory(vcpu->kvm, a0, a1); > > Can you take a look at a proposed hypercall interface for SEV live migration and > holler if you (or anyone else) thinks it will have extensibility issues? > > https://lkml.kernel.org/r/93d7f2c2888315adc48905722574d89699edde33.1618498113.git.ashish.kalra@amd.com Will look closer. Thanks. > > @@ -1868,11 +1874,17 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, > > flags |= FOLL_WRITE; > > if (async) > > flags |= FOLL_NOWAIT; > > + if (kvm->mem_protected) > > + flags |= FOLL_ALLOW_POISONED; > > This is unsafe, only the flows that are mapping the PFN into the guest should > use ALLOW_POISONED, e.g. __kvm_map_gfn() should fail on a poisoned page. That's true for TDX. I prototyped with pure KVM with minimal modification to the guest. We had to be more permissive for the reason. It will go away for TDX. > > -static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, > > - void *data, int offset, int len) > > +int copy_from_guest(struct kvm *kvm, void *data, unsigned long hva, int len) > > +{ > > + int offset = offset_in_page(hva); > > + struct page *page; > > + int npages, seg; > > + void *vaddr; > > + > > + if (!IS_ENABLED(CONFIG_HAVE_KVM_PROTECTED_MEMORY) || > > + !kvm->mem_protected) { > > + return __copy_from_user(data, (void __user *)hva, len); > > + } > > + > > + might_fault(); > > + kasan_check_write(data, len); > > + check_object_size(data, len, false); > > + > > + while ((seg = next_segment(len, offset)) != 0) { > > + npages = get_user_pages_unlocked(hva, 1, &page, > > + FOLL_ALLOW_POISONED); > > + if (npages != 1) > > + return -EFAULT; > > + > > + if (!kvm_page_allowed(kvm, page)) > > + return -EFAULT; > > + > > + vaddr = kmap_atomic(page); > > + memcpy(data, vaddr + offset, seg); > > + kunmap_atomic(vaddr); > > Why is KVM allowed to access a poisoned page? I would expect shared pages to > _not_ be poisoned. Except for pure software emulation of SEV, KVM can't access > guest private memory. Again, it's not going to be in TDX implementation. > I like the idea of using "special" PTE value to denote guest private memory, > e.g. in this RFC, HWPOISON. But I strongly dislike having KVM involved in the > manipulation of the special flag/value. > > Today, userspace owns the gfn->hva translations and the kernel effectively owns > the hva->pfn translations (with input from userspace). KVM just connects the > dots. > > Having KVM own the shared/private transitions means KVM is now part owner of the > entire gfn->hva->pfn translation, i.e. KVM is effectively now a secondary MMU > and a co-owner of the primary MMU. This creates locking madness, e.g. KVM taking > mmap_sem for write, mmu_lock under page lock, etc..., and also takes control away > from userspace. E.g. userspace strategy could be to use a separate backing/pool > for shared memory and change the gfn->hva translation (memslots) in reaction to > a shared/private conversion. 
Automatically swizzling things in KVM takes away > that option. > > IMO, KVM should be entirely "passive" in this process, e.g. the guest shares or > protects memory, userspace calls into the kernel to change state, and the kernel > manages the page tables to prevent bad actors. KVM simply does the plumbing for > the guest page tables.

That's a new perspective for me. Very interesting. Let's see how it could look:

- KVM only allows poisoned pages (or whatever flag we end up using for protection) in the private mappings. SIGBUS otherwise.
- Poisoned pages must be tied to the KVM instance to be allowed in the private mappings. Like kvm->id in the current prototype. SIGBUS otherwise.
- Pages get poisoned on fault-in if the VMA has a new vmflag set.
- Fault-in of a poisoned page leads to a hwpoison entry. Userspace cannot access such pages.
- Poisoned pages produced this way get unpoisoned on free.
- The new VMA flag is set by userspace. mprotect(2)?
- Add a new GUP flag to retrieve such pages from the userspace mapping. Used only for private mapping population.
- Shared gfn ranges are managed by userspace, based on hypercalls from the guest.
- Shared mappings get populated via a normal VMA. Any poisoned pages here would lead to SIGBUS.

So far it looks pretty straightforward. The only thing I don't understand is at what point the page gets tied to the KVM instance. Currently we do it just before populating shadow entries, but that would not work with the new scheme: as we poison pages on fault, they may never get inserted into shadow entries. That's not good, as we rely on that info to unpoison the page on free. Maybe we should tie the VMA to the KVM instance on setting the vmflags? I donno. Any comments?
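For illustration, a minimal userspace sketch of the flow outlined above, assuming the PROT_GUEST mprotect(2) flag that the patch posted later in this thread defines (value 0x04000000) and shmem backing via memfd_create(); the helper name and error handling are illustrative, not code from the series.

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/mman.h>

#ifndef PROT_GUEST
#define PROT_GUEST 0x04000000	/* value from the PageGuest() patch later in the thread */
#endif

/*
 * Sketch only: back guest RAM with shmem and flip it to guest-only so
 * the host can no longer touch it.  Must be a shared shmem mapping for
 * the proposed VM_GUEST flag to be accepted.
 */
static void *alloc_guest_private_ram(size_t size)
{
	int fd = memfd_create("guest-ram", 0);
	void *mem;

	if (fd < 0 || ftruncate(fd, size))
		return MAP_FAILED;

	mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (mem == MAP_FAILED)
		return MAP_FAILED;

	/* Pages faulted in after this point become PageGuest(). */
	if (mprotect(mem, size, PROT_READ | PROT_WRITE | PROT_GUEST)) {
		munmap(mem, size);
		return MAP_FAILED;
	}
	return mem;
}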
On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: > On Fri, Apr 16, 2021 at 05:30:30PM +0000, Sean Christopherson wrote: > > I like the idea of using "special" PTE value to denote guest private memory, > > e.g. in this RFC, HWPOISON. But I strongly dislike having KVM involved in the > > manipulation of the special flag/value. > > > > Today, userspace owns the gfn->hva translations and the kernel effectively owns > > the hva->pfn translations (with input from userspace). KVM just connects the > > dots. > > > > Having KVM own the shared/private transitions means KVM is now part owner of the > > entire gfn->hva->pfn translation, i.e. KVM is effectively now a secondary MMU > > and a co-owner of the primary MMU. This creates locking madness, e.g. KVM taking > > mmap_sem for write, mmu_lock under page lock, etc..., and also takes control away > > from userspace. E.g. userspace strategy could be to use a separate backing/pool > > for shared memory and change the gfn->hva translation (memslots) in reaction to > > a shared/private conversion. Automatically swizzling things in KVM takes away > > that option. > > > > IMO, KVM should be entirely "passive" in this process, e.g. the guest shares or > > protects memory, userspace calls into the kernel to change state, and the kernel > > manages the page tables to prevent bad actors. KVM simply does the plumbing for > > the guest page tables. > > That's a new perspective for me. Very interesting. > > Let's see how it can look like: > > - KVM only allows poisoned pages (or whatever flag we end up using for > protection) in the private mappings. SIGBUS otherwise. > > - Poisoned pages must be tied to the KVM instance to be allowed in the > private mappings. Like kvm->id in the current prototype. SIGBUS > otherwise. > > - Pages get poisoned on fault in if the VMA has a new vmflag set. > > - Fault in of a poisoned page leads to hwpoison entry. Userspace cannot > access such pages. > > - Poisoned pages produced this way get unpoisoned on free. > > - The new VMA flag set by userspace. mprotect(2)? Ya, or mmap(), though I'm not entirely sure a VMA flag would suffice. The notion of the page being private is tied to the PFN, which would suggest "struct page" needs to be involved. But fundamentally the private pages, are well, private. They can't be shared across processes, so I think we could (should?) require the VMA to always be MAP_PRIVATE. Does that buy us enough to rely on the VMA alone? I.e. is that enough to prevent userspace and unaware kernel code from acquiring a reference to the underlying page? > - Add a new GUP flag to retrive such pages from the userspace mapping. > Used only for private mapping population. > - Shared gfn ranges managed by userspace, based on hypercalls from the > guest. > > - Shared mappings get populated via normal VMA. Any poisoned pages here > would lead to SIGBUS. > > So far it looks pretty straight-forward. > > The only thing that I don't understand is at way point the page gets tied > to the KVM instance. Currently we do it just before populating shadow > entries, but it would not work with the new scheme: as we poison pages > on fault it they may never get inserted into shadow entries. That's not > good as we rely on the info to unpoison page on free. Can you elaborate on what you mean by "unpoison"? If the page is never actually mapped into the guest, then its poisoned status is nothing more than a software flag, i.e. nothing extra needs to be done on free. 
If the page is mapped into the guest, then KVM can be made responsible for reinitializing the page with keyid=0 when the page is removed from the guest. The TDX Module prevents mapping the same PFN into multiple guests, so the kernel doesn't actually have to care _which_ KVM instance(s) is associated with a page, it only needs to prevent installing valid PTEs in the host page tables. > Maybe we should tie VMA to the KVM instance on setting the vmflags? > I donno. > > Any comments? > > -- > Kirill A. Shutemov
On Mon, Apr 19, 2021 at 04:01:46PM +0000, Sean Christopherson wrote: > On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: > > On Fri, Apr 16, 2021 at 05:30:30PM +0000, Sean Christopherson wrote: > > > I like the idea of using "special" PTE value to denote guest private memory, > > > e.g. in this RFC, HWPOISON. But I strongly dislike having KVM involved in the > > > manipulation of the special flag/value. > > > > > > Today, userspace owns the gfn->hva translations and the kernel effectively owns > > > the hva->pfn translations (with input from userspace). KVM just connects the > > > dots. > > > > > > Having KVM own the shared/private transitions means KVM is now part owner of the > > > entire gfn->hva->pfn translation, i.e. KVM is effectively now a secondary MMU > > > and a co-owner of the primary MMU. This creates locking madness, e.g. KVM taking > > > mmap_sem for write, mmu_lock under page lock, etc..., and also takes control away > > > from userspace. E.g. userspace strategy could be to use a separate backing/pool > > > for shared memory and change the gfn->hva translation (memslots) in reaction to > > > a shared/private conversion. Automatically swizzling things in KVM takes away > > > that option. > > > > > > IMO, KVM should be entirely "passive" in this process, e.g. the guest shares or > > > protects memory, userspace calls into the kernel to change state, and the kernel > > > manages the page tables to prevent bad actors. KVM simply does the plumbing for > > > the guest page tables. > > > > That's a new perspective for me. Very interesting. > > > > Let's see how it can look like: > > > > - KVM only allows poisoned pages (or whatever flag we end up using for > > protection) in the private mappings. SIGBUS otherwise. > > > > - Poisoned pages must be tied to the KVM instance to be allowed in the > > private mappings. Like kvm->id in the current prototype. SIGBUS > > otherwise. > > > > - Pages get poisoned on fault in if the VMA has a new vmflag set. > > > > - Fault in of a poisoned page leads to hwpoison entry. Userspace cannot > > access such pages. > > > > - Poisoned pages produced this way get unpoisoned on free. > > > > - The new VMA flag set by userspace. mprotect(2)? > > Ya, or mmap(), though I'm not entirely sure a VMA flag would suffice. The > notion of the page being private is tied to the PFN, which would suggest "struct > page" needs to be involved. PG_hwpoison will be set on the page, so it's tied to pfn. > But fundamentally the private pages, are well, private. They can't be shared > across processes, so I think we could (should?) require the VMA to always be > MAP_PRIVATE. Does that buy us enough to rely on the VMA alone? I.e. is that > enough to prevent userspace and unaware kernel code from acquiring a reference > to the underlying page? Shared pages should be fine too (you folks wanted tmpfs support). The poisoned pages must be useless outside of the process with the blessed struct kvm. See kvm_pfn_map in the patch. > > - Add a new GUP flag to retrive such pages from the userspace mapping. > > Used only for private mapping population. > > > - Shared gfn ranges managed by userspace, based on hypercalls from the > > guest. > > > > - Shared mappings get populated via normal VMA. Any poisoned pages here > > would lead to SIGBUS. > > > > So far it looks pretty straight-forward. > > > > The only thing that I don't understand is at way point the page gets tied > > to the KVM instance. 
Currently we do it just before populating shadow > > entries, but it would not work with the new scheme: as we poison pages > > on fault it they may never get inserted into shadow entries. That's not > > good as we rely on the info to unpoison page on free. > > Can you elaborate on what you mean by "unpoison"? If the page is never actually > mapped into the guest, then its poisoned status is nothing more than a software > flag, i.e. nothing extra needs to be done on free. Normally, the poisoned flag is preserved for freed pages, as it usually indicates a hardware issue. In this case we need to return the page to normal circulation. So we need a way to differentiate the two kinds of page poison. The current patch does this by adding the page's pfn to kvm_pfn_map. But this will not work if we uncouple poisoning from adding to the shadow PTE.
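For illustration, a minimal sketch of the distinction being drawn here, assuming a hypothetical free-path hook and that the RFC's kvm_pfn_map xarray is visible to it (the function name is invented): only pages whose pfn was recorded in kvm_pfn_map were poisoned for guest protection, so only those get unpoisoned; genuine hardware poison is left alone.

/*
 * Sketch only: differentiate "software" poison used for guest
 * protection from real memory-failure poison when a page is freed.
 * free_protected_guest_page() is an invented name for wherever the
 * free-path hook would live.
 */
static void free_protected_guest_page(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	if (xa_erase(&kvm_pfn_map, pfn)) {
		/* Poisoned only to keep the host out of guest memory. */
		ClearPageHWPoison(page);
	}
	/* Otherwise: genuine hardware poison, keep PG_hwpoison set. */
}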
On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: > On Mon, Apr 19, 2021 at 04:01:46PM +0000, Sean Christopherson wrote: > > But fundamentally the private pages, are well, private. They can't be shared > > across processes, so I think we could (should?) require the VMA to always be > > MAP_PRIVATE. Does that buy us enough to rely on the VMA alone? I.e. is that > > enough to prevent userspace and unaware kernel code from acquiring a reference > > to the underlying page? > > Shared pages should be fine too (you folks wanted tmpfs support). Is that a conflict though? If the private->shared conversion request is kicked out to userspace, then userspace can re-mmap() the files as MAP_SHARED, no? Allowing MAP_SHARED for guest private memory feels wrong. The data can't be shared, and dirty data can't be written back to the file. > The poisoned pages must be useless outside of the process with the blessed > struct kvm. See kvm_pfn_map in the patch. The big requirement for kernel TDX support is that the pages are useless in the host. Regarding the guest, for TDX, the TDX Module guarantees that at most a single KVM guest can have access to a page at any given time. I believe the RMP provides the same guarantees for SEV-SNP. SEV/SEV-ES could still end up with corruption if multiple guests map the same private page, but that's obviously not the end of the world since it's the status quo today. Living with that shortcoming might be a worthy tradeoff if punting mutual exclusion between guests to firmware/hardware allows us to simplify the kernel implementation. > > > - Add a new GUP flag to retrive such pages from the userspace mapping. > > > Used only for private mapping population. > > > > > - Shared gfn ranges managed by userspace, based on hypercalls from the > > > guest. > > > > > > - Shared mappings get populated via normal VMA. Any poisoned pages here > > > would lead to SIGBUS. > > > > > > So far it looks pretty straight-forward. > > > > > > The only thing that I don't understand is at way point the page gets tied > > > to the KVM instance. Currently we do it just before populating shadow > > > entries, but it would not work with the new scheme: as we poison pages > > > on fault it they may never get inserted into shadow entries. That's not > > > good as we rely on the info to unpoison page on free. > > > > Can you elaborate on what you mean by "unpoison"? If the page is never actually > > mapped into the guest, then its poisoned status is nothing more than a software > > flag, i.e. nothing extra needs to be done on free. > > Normally, poisoned flag preserved for freed pages as it usually indicate > hardware issue. In this case we need return page to the normal circulation. > So we need a way to differentiate two kinds of page poison. Current patch > does this by adding page's pfn to kvm_pfn_map. But this will not work if > we uncouple poisoning and adding to shadow PTE. Why use PG_hwpoison then?
On 19.04.21 20:09, Sean Christopherson wrote: > On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: >> On Mon, Apr 19, 2021 at 04:01:46PM +0000, Sean Christopherson wrote: >>> But fundamentally the private pages, are well, private. They can't be shared >>> across processes, so I think we could (should?) require the VMA to always be >>> MAP_PRIVATE. Does that buy us enough to rely on the VMA alone? I.e. is that >>> enough to prevent userspace and unaware kernel code from acquiring a reference >>> to the underlying page? >> >> Shared pages should be fine too (you folks wanted tmpfs support). > > Is that a conflict though? If the private->shared conversion request is kicked > out to userspace, then userspace can re-mmap() the files as MAP_SHARED, no? > > Allowing MAP_SHARED for guest private memory feels wrong. The data can't be > shared, and dirty data can't be written back to the file. > >> The poisoned pages must be useless outside of the process with the blessed >> struct kvm. See kvm_pfn_map in the patch. > > The big requirement for kernel TDX support is that the pages are useless in the > host. Regarding the guest, for TDX, the TDX Module guarantees that at most a > single KVM guest can have access to a page at any given time. I believe the RMP > provides the same guarantees for SEV-SNP. > > SEV/SEV-ES could still end up with corruption if multiple guests map the same > private page, but that's obviously not the end of the world since it's the status > quo today. Living with that shortcoming might be a worthy tradeoff if punting > mutual exclusion between guests to firmware/hardware allows us to simplify the > kernel implementation. > >>>> - Add a new GUP flag to retrive such pages from the userspace mapping. >>>> Used only for private mapping population. >>> >>>> - Shared gfn ranges managed by userspace, based on hypercalls from the >>>> guest. >>>> >>>> - Shared mappings get populated via normal VMA. Any poisoned pages here >>>> would lead to SIGBUS. >>>> >>>> So far it looks pretty straight-forward. >>>> >>>> The only thing that I don't understand is at way point the page gets tied >>>> to the KVM instance. Currently we do it just before populating shadow >>>> entries, but it would not work with the new scheme: as we poison pages >>>> on fault it they may never get inserted into shadow entries. That's not >>>> good as we rely on the info to unpoison page on free. >>> >>> Can you elaborate on what you mean by "unpoison"? If the page is never actually >>> mapped into the guest, then its poisoned status is nothing more than a software >>> flag, i.e. nothing extra needs to be done on free. >> >> Normally, poisoned flag preserved for freed pages as it usually indicate >> hardware issue. In this case we need return page to the normal circulation. >> So we need a way to differentiate two kinds of page poison. Current patch >> does this by adding page's pfn to kvm_pfn_map. But this will not work if >> we uncouple poisoning and adding to shadow PTE. > > Why use PG_hwpoison then? > I already raised that reusing PG_hwpoison is not what we want. And I repeat, to me this all looks like a big hack; some things you (Sena) propose sound cleaner, at least to me.
On Mon, Apr 19, 2021 at 06:09:29PM +0000, Sean Christopherson wrote: > On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: > > On Mon, Apr 19, 2021 at 04:01:46PM +0000, Sean Christopherson wrote: > > > But fundamentally the private pages, are well, private. They can't be shared > > > across processes, so I think we could (should?) require the VMA to always be > > > MAP_PRIVATE. Does that buy us enough to rely on the VMA alone? I.e. is that > > > enough to prevent userspace and unaware kernel code from acquiring a reference > > > to the underlying page? > > > > Shared pages should be fine too (you folks wanted tmpfs support). > > Is that a conflict though? If the private->shared conversion request is kicked > out to userspace, then userspace can re-mmap() the files as MAP_SHARED, no? > > Allowing MAP_SHARED for guest private memory feels wrong. The data can't be > shared, and dirty data can't be written back to the file. It can be remapped, but faulting in the page would produce hwpoison entry. I don't see other way to make Google's use-case with tmpfs-backed guest memory work. > > The poisoned pages must be useless outside of the process with the blessed > > struct kvm. See kvm_pfn_map in the patch. > > The big requirement for kernel TDX support is that the pages are useless in the > host. Regarding the guest, for TDX, the TDX Module guarantees that at most a > single KVM guest can have access to a page at any given time. I believe the RMP > provides the same guarantees for SEV-SNP. > > SEV/SEV-ES could still end up with corruption if multiple guests map the same > private page, but that's obviously not the end of the world since it's the status > quo today. Living with that shortcoming might be a worthy tradeoff if punting > mutual exclusion between guests to firmware/hardware allows us to simplify the > kernel implementation. The critical question is whether we ever need to translate hva->pfn after the page is added to the guest private memory. I believe we do, but I never checked. And that's the reason we need to keep hwpoison entries around, which encode pfn. If we don't, it would simplify the solution: kvm_pfn_map is not needed. Single bit-per page would be enough. > > > > - Add a new GUP flag to retrive such pages from the userspace mapping. > > > > Used only for private mapping population. > > > > > > > - Shared gfn ranges managed by userspace, based on hypercalls from the > > > > guest. > > > > > > > > - Shared mappings get populated via normal VMA. Any poisoned pages here > > > > would lead to SIGBUS. > > > > > > > > So far it looks pretty straight-forward. > > > > > > > > The only thing that I don't understand is at way point the page gets tied > > > > to the KVM instance. Currently we do it just before populating shadow > > > > entries, but it would not work with the new scheme: as we poison pages > > > > on fault it they may never get inserted into shadow entries. That's not > > > > good as we rely on the info to unpoison page on free. > > > > > > Can you elaborate on what you mean by "unpoison"? If the page is never actually > > > mapped into the guest, then its poisoned status is nothing more than a software > > > flag, i.e. nothing extra needs to be done on free. > > > > Normally, poisoned flag preserved for freed pages as it usually indicate > > hardware issue. In this case we need return page to the normal circulation. > > So we need a way to differentiate two kinds of page poison. Current patch > > does this by adding page's pfn to kvm_pfn_map. 
But this will not work if > > we uncouple poisoning and adding to shadow PTE. > > Why use PG_hwpoison then? Page flags are scarce. I don't want to occupy a new one until I'm sure I must. And we can re-use the existing infrastructure to SIGBUS on access to such pages.
On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: > On Mon, Apr 19, 2021 at 06:09:29PM +0000, Sean Christopherson wrote: > > On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: > > > On Mon, Apr 19, 2021 at 04:01:46PM +0000, Sean Christopherson wrote: > > > > But fundamentally the private pages, are well, private. They can't be shared > > > > across processes, so I think we could (should?) require the VMA to always be > > > > MAP_PRIVATE. Does that buy us enough to rely on the VMA alone? I.e. is that > > > > enough to prevent userspace and unaware kernel code from acquiring a reference > > > > to the underlying page? > > > > > > Shared pages should be fine too (you folks wanted tmpfs support). > > > > Is that a conflict though? If the private->shared conversion request is kicked > > out to userspace, then userspace can re-mmap() the files as MAP_SHARED, no? > > > > Allowing MAP_SHARED for guest private memory feels wrong. The data can't be > > shared, and dirty data can't be written back to the file. > > It can be remapped, but faulting in the page would produce hwpoison entry. It sounds like you're thinking the whole tmpfs file is poisoned. My thought is that userspace would need to do something like for guest private memory: mmap(NULL, guest_size, PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_GUEST_ONLY, fd, 0); The MAP_GUEST_ONLY would be used by the kernel to ensure the resulting VMA can only point at private/poisoned memory, e.g. on fault, the associated PFN would be tagged with PG_hwpoison or whtaever. @fd in this case could point at tmpfs, but I don't think it's a hard requirement. On conversion to shared, userspace could then do: munmap(<addr>, <size>) mmap(<addr>, <size>, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_FIXED_NOREPLACE, fd, <offset>); or mmap(<addr>, <size>, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, <offset>); or ioctl(kvm, KVM_SET_USER_MEMORY_REGION, <delete private range>); mmap(NULL, <size>, PROT_READ|PROT_WRITE, MAP_SHARED, fd, <offset>); ioctl(kvm, KVM_SET_USER_MEMORY_REGION, <add shared range>); Combinations would also work, e.g. unmap the private range and move the memslot. The private and shared memory regions could also be backed differently, e.g. tmpfs for shared memory, anonymous for private memory. > I don't see other way to make Google's use-case with tmpfs-backed guest > memory work. The underlying use-case is to be able to access guest memory from more than one process, e.g. so that communication with the guest isn't limited to the VMM process associated with the KVM instances. By definition, guest private memory can't be accessed by the host; I don't see how anyone, Google included, can have any real requirements about > > > The poisoned pages must be useless outside of the process with the blessed > > > struct kvm. See kvm_pfn_map in the patch. > > > > The big requirement for kernel TDX support is that the pages are useless in the > > host. Regarding the guest, for TDX, the TDX Module guarantees that at most a > > single KVM guest can have access to a page at any given time. I believe the RMP > > provides the same guarantees for SEV-SNP. > > > > SEV/SEV-ES could still end up with corruption if multiple guests map the same > > private page, but that's obviously not the end of the world since it's the status > > quo today. Living with that shortcoming might be a worthy tradeoff if punting > > mutual exclusion between guests to firmware/hardware allows us to simplify the > > kernel implementation. 
> > The critical question is whether we ever need to translate hva->pfn after > the page is added to the guest private memory. I believe we do, but I > never checked. And that's the reason we need to keep hwpoison entries > around, which encode pfn. As proposed in the TDX RFC, KVM would "need" the hva->pfn translation if the guest private EPT entry was zapped, e.g. by NUMA balancing (which will fail on the backend). But in that case, KVM still has the original PFN, the "new" translation becomes a sanity check to make sure that the zapped translation wasn't moved unexpectedly. Regardless, I don't see what that has to do with kvm_pfn_map. At some point, gup() has to fault in the page or look at the host PTE value. For the latter, at least on x86, we can throw info into the PTE itself to tag it as guest-only. No matter what implementation we settle on, I think we've failed if we end up in a situation where the primary MMU has pages it doesn't know are guest-only. > If we don't, it would simplify the solution: kvm_pfn_map is not needed. > Single bit-per page would be enough.
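For illustration, a compact userspace sketch of the memslot-based private→shared conversion described above (the "delete private range, mmap() shared, add shared range" variant); the slot layout and parameters are illustrative, and error handling is minimal.

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/*
 * Sketch only: convert a GPA range from private to shared by deleting
 * the private memslot, mapping ordinary shared memory, and registering
 * the new HVA as a normal memslot.
 */
static int convert_range_to_shared(int vm_fd, int backing_fd, __u32 slot,
				   __u64 gpa, __u64 size, __u64 offset)
{
	struct kvm_userspace_memory_region region = {
		.slot = slot,
		.guest_phys_addr = gpa,
		.memory_size = 0,	/* size 0 deletes the existing slot */
	};
	void *hva;

	/* Drop the private mapping of this GPA range. */
	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region))
		return -1;

	/* Back the range with ordinary, host-accessible shared memory. */
	hva = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   backing_fd, offset);
	if (hva == MAP_FAILED)
		return -1;

	/* Re-register the range as a normal shared memslot. */
	region.userspace_addr = (__u64)(unsigned long)hva;
	region.memory_size = size;
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}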
On Mon, Apr 19, 2021 at 08:09:13PM +0000, Sean Christopherson wrote: > On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: > > On Mon, Apr 19, 2021 at 06:09:29PM +0000, Sean Christopherson wrote: > > > On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: > > > > On Mon, Apr 19, 2021 at 04:01:46PM +0000, Sean Christopherson wrote: > > > > > But fundamentally the private pages, are well, private. They can't be shared > > > > > across processes, so I think we could (should?) require the VMA to always be > > > > > MAP_PRIVATE. Does that buy us enough to rely on the VMA alone? I.e. is that > > > > > enough to prevent userspace and unaware kernel code from acquiring a reference > > > > > to the underlying page? > > > > > > > > Shared pages should be fine too (you folks wanted tmpfs support). > > > > > > Is that a conflict though? If the private->shared conversion request is kicked > > > out to userspace, then userspace can re-mmap() the files as MAP_SHARED, no? > > > > > > Allowing MAP_SHARED for guest private memory feels wrong. The data can't be > > > shared, and dirty data can't be written back to the file. > > > > It can be remapped, but faulting in the page would produce hwpoison entry. > > It sounds like you're thinking the whole tmpfs file is poisoned. No. File is not poisoned. Pages got poisoned when they are faulted into flagged VMA. Different VM can use the same file as long as offsets do not overlap. > My thought is that userspace would need to do something like for guest > private memory: > > mmap(NULL, guest_size, PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_GUEST_ONLY, fd, 0); > > The MAP_GUEST_ONLY would be used by the kernel to ensure the resulting VMA can > only point at private/poisoned memory, e.g. on fault, the associated PFN would > be tagged with PG_hwpoison or whtaever. @fd in this case could point at tmpfs, > but I don't think it's a hard requirement. > > On conversion to shared, userspace could then do: > > munmap(<addr>, <size>) > mmap(<addr>, <size>, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_FIXED_NOREPLACE, fd, <offset>); > > or > > mmap(<addr>, <size>, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, <offset>); > I played with this variant before, but initiated from kernel. Should work fine. > or > > ioctl(kvm, KVM_SET_USER_MEMORY_REGION, <delete private range>); > mmap(NULL, <size>, PROT_READ|PROT_WRITE, MAP_SHARED, fd, <offset>); > ioctl(kvm, KVM_SET_USER_MEMORY_REGION, <add shared range>); > > Combinations would also work, e.g. unmap the private range and move the memslot. > The private and shared memory regions could also be backed differently, e.g. > tmpfs for shared memory, anonymous for private memory. Right. Kernel has to be flexible enough to provide any of the schemes. > > > I don't see other way to make Google's use-case with tmpfs-backed guest > > memory work. > > The underlying use-case is to be able to access guest memory from more than one > process, e.g. so that communication with the guest isn't limited to the VMM > process associated with the KVM instances. By definition, guest private memory > can't be accessed by the host; I don't see how anyone, Google included, can have > any real requirements about > > > > > The poisoned pages must be useless outside of the process with the blessed > > > > struct kvm. See kvm_pfn_map in the patch. > > > > > > The big requirement for kernel TDX support is that the pages are useless in the > > > host. 
Regarding the guest, for TDX, the TDX Module guarantees that at most a > > > single KVM guest can have access to a page at any given time. I believe the RMP > > > provides the same guarantees for SEV-SNP. > > > > > > SEV/SEV-ES could still end up with corruption if multiple guests map the same > > > private page, but that's obviously not the end of the world since it's the status > > > quo today. Living with that shortcoming might be a worthy tradeoff if punting > > > mutual exclusion between guests to firmware/hardware allows us to simplify the > > > kernel implementation. > > > > The critical question is whether we ever need to translate hva->pfn after > > the page is added to the guest private memory. I believe we do, but I > > never checked. And that's the reason we need to keep hwpoison entries > > around, which encode pfn. > > As proposed in the TDX RFC, KVM would "need" the hva->pfn translation if the > guest private EPT entry was zapped, e.g. by NUMA balancing (which will fail on > the backend). But in that case, KVM still has the original PFN, the "new" > translation becomes a sanity check to make sure that the zapped translation > wasn't moved unexpectedly. > > Regardless, I don't see what that has to do with kvm_pfn_map. At some point, > gup() has to fault in the page or look at the host PTE value. For the latter, > at least on x86, we can throw info into the PTE itself to tag it as guest-only. > No matter what implementation we settle on, I think we've failed if we end up in > a situation where the primary MMU has pages it doesn't know are guest-only. I try to understand if it's a problem if KVM sees a guest-only PTE, but it's for other VM. Like two VM's try to use the same tmpfs file as guest memory. We cannot insert the pfn into two TD/SEV guest at once, but can it cause other problems? I'm not sure.
On Tue, Apr 20, 2021, Kirill A. Shutemov wrote: > On Mon, Apr 19, 2021 at 08:09:13PM +0000, Sean Christopherson wrote: > > On Mon, Apr 19, 2021, Kirill A. Shutemov wrote: > > > The critical question is whether we ever need to translate hva->pfn after > > > the page is added to the guest private memory. I believe we do, but I > > > never checked. And that's the reason we need to keep hwpoison entries > > > around, which encode pfn. > > > > As proposed in the TDX RFC, KVM would "need" the hva->pfn translation if the > > guest private EPT entry was zapped, e.g. by NUMA balancing (which will fail on > > the backend). But in that case, KVM still has the original PFN, the "new" > > translation becomes a sanity check to make sure that the zapped translation > > wasn't moved unexpectedly. > > > > Regardless, I don't see what that has to do with kvm_pfn_map. At some point, > > gup() has to fault in the page or look at the host PTE value. For the latter, > > at least on x86, we can throw info into the PTE itself to tag it as guest-only. > > No matter what implementation we settle on, I think we've failed if we end up in > > a situation where the primary MMU has pages it doesn't know are guest-only. > > I try to understand if it's a problem if KVM sees a guest-only PTE, but > it's for other VM. Like two VM's try to use the same tmpfs file as guest > memory. We cannot insert the pfn into two TD/SEV guest at once, but can it > cause other problems? I'm not sure. For TDX and SNP, "firmware" will prevent assigning the same PFN to multiple VMs. For SEV and SEV-ES, the PSP (what I'm calling "firmware") will not prevent assigning the same page to multiple guests. But the failure mode in that case, assuming the guests have different ASIDs, is limited to corruption of the guest. On the other hand, for SEV/SEV-ES it's not invalid to assign the same ASID to multiple guests (there's an in-flight patch to do exactly that[*]), and sharing PFNs between guests with the same ASID would also be valid. In other words, if we want to enforce PFN association in the kernel, I think the association should be per-ASID, not per-KVM guest. So, I don't think we _need_ to rely on the TDX/SNP behavior, but if leveraging firmware to handle those checks means avoiding additional complexity in the kernel, then I think it's worth leaning on firmware even if it means SEV/SEV-ES don't enjoy the same level of robustness. [*] https://lkml.kernel.org/r/20210408223214.2582277-1-natet@google.com
Hi Sean, The core patch of the approach we've discussed before is below. It introduce a new page type with the required semantics. The full patchset can be found here: git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git kvm-unmapped-guest-only but only the patch below is relevant for TDX. QEMU patch is attached. CONFIG_HAVE_KVM_PROTECTED_MEMORY has to be changed to what is appropriate for TDX and FOLL_GUEST has to be used in hva_to_pfn_slow() when running TDX guest. When page get inserted into private sept we must make sure it is PageGuest() or SIGBUS otherwise. Inserting PageGuest() into shared is fine, but the page will not be accessible from userspace. Any feedback is welcome. -------------------------------8<------------------------------------------- From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Date: Fri, 16 Apr 2021 01:30:48 +0300 Subject: [PATCH] mm: Introduce guest-only pages PageGuest() pages are only allowed to be used as guest memory. Userspace is not allowed read from or write to such pages. On page fault, PageGuest() pages produce PROT_NONE page table entries. Read or write there will trigger SIGBUS. Access to such pages via syscall leads to -EIO. The new mprotect(2) flag PROT_GUEST translates to VM_GUEST. Any page fault to VM_GUEST VMA produces PageGuest() page. Only shared tmpfs/shmem mappings are supported. GUP normally fails on such pages. KVM will use the new FOLL_GUEST flag to access them. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- include/linux/mm.h | 5 +++- include/linux/mman.h | 7 ++++-- include/linux/page-flags.h | 8 +++++++ include/uapi/asm-generic/mman-common.h | 1 + mm/gup.c | 33 +++++++++++++++++++++----- mm/huge_memory.c | 24 +++++++++++++++---- mm/memory.c | 26 ++++++++++++++------ mm/mprotect.c | 18 ++++++++++++-- mm/shmem.c | 13 ++++++++++ 9 files changed, 113 insertions(+), 22 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ecdf8a8cd6ae..f8024061cece 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -362,6 +362,8 @@ extern unsigned int kobjsize(const void *objp); # define VM_GROWSUP VM_NONE #endif +#define VM_GUEST VM_HIGH_ARCH_4 + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) @@ -413,7 +415,7 @@ extern unsigned int kobjsize(const void *objp); #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif -#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) +#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_GUEST | VM_ARCH_CLEAR) /* * mapping from the currently active vm_flags protection bits (the @@ -2802,6 +2804,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_GUEST 0x100000 /* allow access to guest-only pages */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each diff --git a/include/linux/mman.h b/include/linux/mman.h index 629cefc4ecba..204e03d7787c 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -103,7 +103,9 @@ static inline void vm_unacct_memory(long pages) */ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) { - return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0; + int allowed; + allowed = PROT_READ | PROT_WRITE | PROT_EXEC | 
PROT_SEM | PROT_GUEST; + return (prot & ~allowed) == 0; } #define arch_validate_prot arch_validate_prot #endif @@ -140,7 +142,8 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { return _calc_vm_trans(prot, PROT_READ, VM_READ ) | _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) | - _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) | + _calc_vm_trans(prot, PROT_EXEC, VM_EXEC ) | + _calc_vm_trans(prot, PROT_GUEST, VM_GUEST) | arch_calc_vm_prot_bits(prot, pkey); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index ec5d0290e0ee..f963b96711f5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -436,6 +436,14 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif +#if defined(CONFIG_64BIT) && defined(CONFIG_HAVE_KVM_PROTECTED_MEMORY) +PAGEFLAG(Guest, arch_2, PF_HEAD) +TESTSCFLAG(Guest, arch_2, PF_HEAD) +#else +PAGEFLAG_FALSE(Guest) +TESTSCFLAG_FALSE(Guest) +#endif + #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f94f65d429be..c4d985d22b49 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -16,6 +16,7 @@ #define PROT_NONE 0x0 /* page can not be accessed */ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ +#define PROT_GUEST 0x04000000 /* KVM guest memory */ /* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ diff --git a/mm/gup.c b/mm/gup.c index e4c224cd9661..d581ddb900e5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -357,10 +357,22 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, * FOLL_FORCE can write to even unwritable pte's, but only * after we've gone through a COW cycle and they are dirty. */ -static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) +static inline bool can_follow_write_pte(struct vm_area_struct *vma, + pte_t pte, unsigned int flags) { - return pte_write(pte) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); + if (pte_write(pte)) + return true; + + if ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)) + return true; + + if (!(flags & FOLL_GUEST)) + return false; + + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) != (VM_WRITE | VM_SHARED)) + return false; + + return PageGuest(pte_page(pte)); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -401,9 +413,18 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, migration_entry_wait(mm, pmd, address); goto retry; } - if ((flags & FOLL_NUMA) && pte_protnone(pte)) - goto no_page; - if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { + + if (pte_protnone(pte)) { + if (flags & FOLL_GUEST) { + page = pte_page(pte); + if (!PageGuest(page)) + goto no_page; + } else if (flags & FOLL_NUMA) { + goto no_page; + } + } + + if ((flags & FOLL_WRITE) && !can_follow_write_pte(vma, pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 91ca9b103ee5..9cd400a44f94 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1335,10 +1335,22 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) * FOLL_FORCE can write to even unwritable pmd's, but only * after we've gone through a COW cycle and they are dirty. 
*/ -static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) +static inline bool can_follow_write_pmd(struct vm_area_struct *vma, + pmd_t pmd, unsigned int flags) { - return pmd_write(pmd) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); + if (pmd_write(pmd)) + return true; + + if ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)) + return true; + + if (!(flags & FOLL_GUEST)) + return false; + + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) != (VM_WRITE | VM_SHARED)) + return false; + + return PageGuest(pmd_page(pmd)); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, @@ -1351,7 +1363,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, assert_spin_locked(pmd_lockptr(mm, pmd)); - if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) + if (flags & FOLL_WRITE && !can_follow_write_pmd(vma, *pmd, flags)) goto out; /* Avoid dumping huge zero page */ @@ -1425,6 +1437,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) bool was_writable; int flags = 0; + page = pmd_page(pmd); + if (PageGuest(page)) + return VM_FAULT_SIGBUS; + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(pmd, *vmf->pmd))) goto out_unlock; diff --git a/mm/memory.c b/mm/memory.c index feff48e1465a..590fb43f8296 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3751,9 +3751,13 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) for (i = 0; i < HPAGE_PMD_NR; i++) flush_icache_page(vma, page + i); - entry = mk_huge_pmd(page, vma->vm_page_prot); - if (write) - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (PageGuest(page)) { + entry = mk_huge_pmd(page, PAGE_NONE); + } else { + entry = mk_huge_pmd(page, vma->vm_page_prot); + if (write) + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + } add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); page_add_file_rmap(page, true); @@ -3823,10 +3827,14 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) } flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); - if (write) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (PageGuest(page)) { + entry = mk_pte(page, PAGE_NONE); + } else { + entry = mk_pte(page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + } /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); @@ -4185,6 +4193,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; + page = pte_page(vmf->orig_pte); + if (PageGuest(page)) + return VM_FAULT_SIGBUS; + /* * The "pte" at this point cannot be used safely without * validation through pte_unmap_same(). It's of NUMA type but diff --git a/mm/mprotect.c b/mm/mprotect.c index ab709023e9aa..113be3c9ce08 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -484,8 +484,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); vma_set_page_prot(vma); - change_protection(vma, start, end, vma->vm_page_prot, - dirty_accountable ? MM_CP_DIRTY_ACCT : 0); + if (vma->vm_flags & VM_GUEST) { + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); + } else { + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable ? 
MM_CP_DIRTY_ACCT : 0); + } /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major @@ -603,6 +607,16 @@ static int do_mprotect_pkey(unsigned long start, size_t len, goto out; } + if ((newflags & (VM_GUEST|VM_SHARED)) == VM_GUEST) { + error = -EINVAL; + goto out; + } + + if ((newflags & VM_GUEST) && !vma_is_shmem(vma)) { + error = -EINVAL; + goto out; + } + /* Allow architectures to sanity-check the new flags */ if (!arch_validate_flags(newflags)) { error = -EINVAL; diff --git a/mm/shmem.c b/mm/shmem.c index 7c6b6d8f6c39..75bb56cf78a0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1835,6 +1835,11 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (page && sgp == SGP_WRITE) mark_page_accessed(page); + if (page && PageGuest(page) && sgp != SGP_CACHE) { + error = -EIO; + goto unlock; + } + /* fallocated page? */ if (page && !PageUptodate(page)) { if (sgp != SGP_READ) @@ -2115,6 +2120,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) gfp, vma, vmf, &ret); if (err) return vmf_error(err); + + if (vmf->vma->vm_flags & VM_GUEST) { + if (!TestSetPageGuest(vmf->page)) { + try_to_unmap(compound_head(vmf->page), + TTU_IGNORE_MLOCK); + } + } + return ret; }
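For illustration, a rough sketch of the KVM hookup the cover text above describes (FOLL_GUEST in hva_to_pfn_slow() when running a TDX guest); the signature is simplified and vm_is_protected() is an invented placeholder for whatever per-VM check ends up being used, so this is not code from the posted branch.

/*
 * Sketch only: resolve an HVA for a protected guest.  FOLL_GUEST lets
 * GUP return PageGuest() pages, which ordinary userspace access paths
 * will never see.  vm_is_protected() is a placeholder predicate.
 */
static int hva_to_pfn_slow_protected(struct kvm *kvm, unsigned long addr,
				     bool write_fault, kvm_pfn_t *pfn)
{
	unsigned int flags = FOLL_HWPOISON;
	struct page *page;

	if (write_fault)
		flags |= FOLL_WRITE;
	if (vm_is_protected(kvm))	/* placeholder for the real check */
		flags |= FOLL_GUEST;

	if (get_user_pages_unlocked(addr, 1, &page, flags) != 1)
		return -EFAULT;

	*pfn = page_to_pfn(page);
	return 0;
}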
On Fri, May 21, 2021, Kirill A. Shutemov wrote: > Hi Sean, > > The core patch of the approach we've discussed before is below. It > introduce a new page type with the required semantics. > > The full patchset can be found here: > > git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git kvm-unmapped-guest-only > > but only the patch below is relevant for TDX. QEMU patch is attached. Can you post the whole series? The KVM behavior and usage of FOLL_GUEST is very relevant to TDX. > CONFIG_HAVE_KVM_PROTECTED_MEMORY has to be changed to what is appropriate > for TDX and FOLL_GUEST has to be used in hva_to_pfn_slow() when running > TDX guest. This behavior in particular is relevant; KVM should provide FOLL_GUEST iff the access is private or the VM type doesn't differentiate between private and shared. > When page get inserted into private sept we must make sure it is > PageGuest() or SIGBUS otherwise. More KVM feedback :-) Ideally, KVM will synchronously exit to userspace with detailed information on the bad behavior, not do SIGBUS. Hopefully that infrastructure will be in place sooner than later. https://lkml.kernel.org/r/YKxJLcg/WomPE422@google.com > Inserting PageGuest() into shared is fine, but the page will not be accessible > from userspace. Even if it can be functionally fine, I don't think we want to allow KVM to map PageGuest() as shared memory. The only reason to map memory shared is to share it with something, e.g. the host, that doesn't have access to private memory, so I can't envision a use case. On the KVM side, it's trivially easy to omit FOLL_GUEST for shared memory, while always passing FOLL_GUEST would require manually zapping. Manual zapping isn't a big deal, but I do think it can be avoided if userspace must either remap the hva or define a new KVM memslot (new gpa->hva), both of which will automatically zap any existing translations. Aha, thought of a concrete problem. If KVM maps PageGuest() into shared memory, then KVM must ensure that the page is not mapped private via a different hva/gpa, and is not mapped _any_ other guest because the TDX-Module's 1:1 PFN:TD+GPA enforcement only applies to private memory. The explicit "VM_WRITE | VM_SHARED" requirement below makes me think this wouldn't be prevented. Oh, and the other nicety is that I think it would avoid having to explicitly handle PageGuest() memory that is being accessed from kernel/KVM, i.e. if all memory exposed to KVM must be !PageGuest(), then it is also eligible for copy_{to,from}_user(). > Any feedback is welcome. > > -------------------------------8<------------------------------------------- > > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> > Date: Fri, 16 Apr 2021 01:30:48 +0300 > Subject: [PATCH] mm: Introduce guest-only pages > > PageGuest() pages are only allowed to be used as guest memory. Userspace > is not allowed read from or write to such pages. > > On page fault, PageGuest() pages produce PROT_NONE page table entries. > Read or write there will trigger SIGBUS. Access to such pages via > syscall leads to -EIO. > > The new mprotect(2) flag PROT_GUEST translates to VM_GUEST. Any page > fault to VM_GUEST VMA produces PageGuest() page. > > Only shared tmpfs/shmem mappings are supported. Is limiting this to tmpfs/shmem only for the PoC/RFC, or is it also expected to be the long-term behavior? > GUP normally fails on such pages. KVM will use the new FOLL_GUEST flag > to access them.
On Wed, May 26, 2021 at 07:46:52PM +0000, Sean Christopherson wrote: > On Fri, May 21, 2021, Kirill A. Shutemov wrote: > > Hi Sean, > > > > The core patch of the approach we've discussed before is below. It > > introduce a new page type with the required semantics. > > > > The full patchset can be found here: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git kvm-unmapped-guest-only > > > > but only the patch below is relevant for TDX. QEMU patch is attached. > > Can you post the whole series? I hoped to get it posted as part of TDX host enabling. As it is, the feature is incomplete for pure KVM: I didn't implement on the KVM side the checks that the TDX module/hardware provides, so nothing prevents the same page from being added to multiple KVM instances. > The KVM behavior and usage of FOLL_GUEST is very relevant to TDX. The patch can be found here: https://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git/commit/?h=kvm-unmapped-guest-only&id=2cd6c2c20528696a46a2a59383ca81638bf856b5 > > CONFIG_HAVE_KVM_PROTECTED_MEMORY has to be changed to what is appropriate > > for TDX and FOLL_GUEST has to be used in hva_to_pfn_slow() when running > > TDX guest. > > This behavior in particular is relevant; KVM should provide FOLL_GUEST iff the > access is private or the VM type doesn't differentiate between private and > shared. I added FOLL_GUEST if the KVM instance has the feature enabled. On top of that, TDX-specific code has to check that the page is in fact PageGuest() before inserting it into the private SEPT. The scheme makes sure that user-accessible memory cannot be added as private to a TD. > > When page get inserted into private sept we must make sure it is > > PageGuest() or SIGBUS otherwise. > > More KVM feedback :-) > > Ideally, KVM will synchronously exit to userspace with detailed information on > the bad behavior, not do SIGBUS. Hopefully that infrastructure will be in place > sooner than later. > > https://lkml.kernel.org/r/YKxJLcg/WomPE422@google.com My experiments are still on v5.11, but I can rebase to whatever is needed once the infrastructure hits upstream. > > Inserting PageGuest() into shared is fine, but the page will not be accessible > > from userspace. > > Even if it can be functionally fine, I don't think we want to allow KVM to map > PageGuest() as shared memory. The only reason to map memory shared is to share > it with something, e.g. the host, that doesn't have access to private memory, so > I can't envision a use case. > > On the KVM side, it's trivially easy to omit FOLL_GUEST for shared memory, while > always passing FOLL_GUEST would require manually zapping. Manual zapping isn't > a big deal, but I do think it can be avoided if userspace must either remap the > hva or define a new KVM memslot (new gpa->hva), both of which will automatically > zap any existing translations. > > Aha, thought of a concrete problem. If KVM maps PageGuest() into shared memory, > then KVM must ensure that the page is not mapped private via a different hva/gpa, > and is not mapped _any_ other guest because the TDX-Module's 1:1 PFN:TD+GPA > enforcement only applies to private memory. The explicit "VM_WRITE | VM_SHARED" > requirement below makes me think this wouldn't be prevented. Hm. I didn't realize that the TDX module doesn't prevent the same page from being used as shared and private at the same time. Omitting FOLL_GUEST for shared memory doesn't look like the right approach. IIUC, it would require the kernel to track which memory is shared and which is private, which defeats the purpose of the rework.
I would rather enforce !PageGuest() when the shared SEPT is populated, in addition to enforcing PageGuest() for the private SEPT. Do you see any problems with this? > Oh, and the other nicety is that I think it would avoid having to explicitly > handle PageGuest() memory that is being accessed from kernel/KVM, i.e. if all > memory exposed to KVM must be !PageGuest(), then it is also eligible for > copy_{to,from}_user(). The copy_{to,from}_user() restriction is enforced by setting PTE entries to PROT_NONE. Or do I miss your point? > > > Any feedback is welcome. > > > > -------------------------------8<------------------------------------------- > > > > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> > > Date: Fri, 16 Apr 2021 01:30:48 +0300 > > Subject: [PATCH] mm: Introduce guest-only pages > > > > PageGuest() pages are only allowed to be used as guest memory. Userspace > > is not allowed read from or write to such pages. > > > > On page fault, PageGuest() pages produce PROT_NONE page table entries. > > Read or write there will trigger SIGBUS. Access to such pages via > > syscall leads to -EIO. > > > > The new mprotect(2) flag PROT_GUEST translates to VM_GUEST. Any page > > fault to VM_GUEST VMA produces PageGuest() page. > > > > Only shared tmpfs/shmem mappings are supported. > > Is limiting this to tmpfs/shmem only for the PoC/RFC, or is it also expected to > be the long-term behavior? I expect it to be enough to cover all relevant cases, no? Note that MAP_ANONYMOUS|MAP_SHARED also fits here.
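To make the two checks concrete, here is a minimal sketch; the hook names are hypothetical and not taken from the posted series, and the error handling (SIGBUS vs. a synchronous exit to userspace) is exactly what is being debated in this thread. PageGuest() is the page type introduced by the patch elsewhere in the thread.

	/*
	 * Illustrative sketch only: hypothetical helpers called when KVM
	 * populates the private S-EPT or the shared EPT.
	 */
	static int tdx_check_private_page(kvm_pfn_t pfn)
	{
		/* Private S-EPT may only map guest-only pages. */
		if (!PageGuest(pfn_to_page(pfn)))
			return -EIO;	/* SIGBUS or exit to userspace */
		return 0;
	}

	static int tdx_check_shared_page(kvm_pfn_t pfn)
	{
		/* Shared EPT must never map a guest-only page. */
		if (PageGuest(pfn_to_page(pfn)))
			return -EIO;
		return 0;
	}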
On Mon, May 31, 2021, Kirill A. Shutemov wrote: > On Wed, May 26, 2021 at 07:46:52PM +0000, Sean Christopherson wrote: > > On Fri, May 21, 2021, Kirill A. Shutemov wrote: > > > Inserting PageGuest() into shared is fine, but the page will not be accessible > > > from userspace. > > > > Even if it can be functionally fine, I don't think we want to allow KVM to map > > PageGuest() as shared memory. The only reason to map memory shared is to share > > it with something, e.g. the host, that doesn't have access to private memory, so > > I can't envision a use case. > > > > On the KVM side, it's trivially easy to omit FOLL_GUEST for shared memory, while > > always passing FOLL_GUEST would require manually zapping. Manual zapping isn't > > a big deal, but I do think it can be avoided if userspace must either remap the > > hva or define a new KVM memslot (new gpa->hva), both of which will automatically > > zap any existing translations. > > > > Aha, thought of a concrete problem. If KVM maps PageGuest() into shared memory, > > then KVM must ensure that the page is not mapped private via a different hva/gpa, > > and is not mapped _any_ other guest because the TDX-Module's 1:1 PFN:TD+GPA > > enforcement only applies to private memory. The explicit "VM_WRITE | VM_SHARED" > > requirement below makes me think this wouldn't be prevented. > > Hm. I didn't realize that TDX module doesn't prevent the same page to be > used as shared and private at the same time. Ya, only private mappings are routed through the TDX module, e.g. it can prevent mapping the same page as private into multiple guests, but it can't prevent the host from mapping the page as non-private. > Omitting FOLL_GUEST for shared memory doesn't look like a right approach. > IIUC, it would require the kernel to track what memory is share and what > private, which defeat the purpose of the rework. I would rather enforce > !PageGuest() when share SEPT is populated in addition to enforcing > PageGuest() fro private SEPT. Isn't that what omitting FOLL_GUEST would accomplish? For shared memory, including mapping memory into the shared EPT, KVM will omit FOLL_GUEST and thus require the memory to be readable/writable according to the guest access type. By definition, that excludes PageGuest() because PageGuest() pages must always be unmapped, e.g. PROTNONE. And for private EPT, because PageGuest() is always PROTNONE or whatever, it will require FOLL_GUEST to retrieve the PTE/PMD/Pxx. On a semi-related topic, I don't think can_follow_write_pte() is the correct place to hook PageGuest(). TDX's S-EPT has a quirk where all private guest memory must be mapped writable, but that quirk doesn't hold true for non-TDX guests. It should be legal to map private guest memory as read-only. And I believe the below snippet in follow_page_pte() will be problematic too, since FOLL_NUMA is added unless FOLL_FORCE is set. I suspect the correct approach is to handle FOLL_GUEST as an exception to pte_protnone(), though that might require adjusting pte_protnone() to be meaningful even when CONFIG_NUMA_BALANCING=n. if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } > Do you see any problems with this? > > > Oh, and the other nicety is that I think it would avoid having to explicitly > > handle PageGuest() memory that is being accessed from kernel/KVM, i.e. 
if all > > memory exposed to KVM must be !PageGuest(), then it is also eligible for > > copy_{to,from}_user(). > > copy_{to,from}_user() enforce by setting PTE entries to PROT_NONE. But KVM does _not_ want those PTEs PROT_NONE. If KVM is accessing memory that is also accessible by the guest, then it must be shared. And if it's shared, it must also be accessible to host userspace, i.e. something other than PROT_NONE, otherwise the memory isn't actually shared with anything. As above, any guest-accessible memory that is accessed by the host must be shared, and so must be mapped with the required permissions.
On Wed, Jun 02, 2021 at 05:51:02PM +0000, Sean Christopherson wrote: > > Omitting FOLL_GUEST for shared memory doesn't look like a right approach. > > IIUC, it would require the kernel to track what memory is share and what > > private, which defeat the purpose of the rework. I would rather enforce > > !PageGuest() when share SEPT is populated in addition to enforcing > > PageGuest() fro private SEPT. > > Isn't that what omitting FOLL_GUEST would accomplish? For shared memory, > including mapping memory into the shared EPT, KVM will omit FOLL_GUEST and thus > require the memory to be readable/writable according to the guest access type. Ah. I guess I see what you're saying: we can pipe the shared bit of the GPA from direct_page_fault() (or whatever handles the fault) down to hva_to_pfn_slow() and omit FOLL_GUEST if the shared bit is set. Right? I guess it's doable, but the code shuffling is going to be ugly. > By definition, that excludes PageGuest() because PageGuest() pages must always > be unmapped, e.g. PROTNONE. And for private EPT, because PageGuest() is always > PROTNONE or whatever, it will require FOLL_GUEST to retrieve the PTE/PMD/Pxx. > > On a semi-related topic, I don't think can_follow_write_pte() is the correct > place to hook PageGuest(). TDX's S-EPT has a quirk where all private guest > memory must be mapped writable, but that quirk doesn't hold true for non-TDX > guests. It should be legal to map private guest memory as read-only. Hm. The point of the change in can_follow_write_pte() is to only allow writing to a PageGuest() page if FOLL_GUEST is used and the mapping is writable. Without the change gup(FOLL_GUEST|FOLL_WRITE) would fail. It doesn't prevent using read-only guest mappings as read-only. But if you want to write to it, the mapping has to be writable (in addition to FOLL_GUEST). > And I believe the below snippet in follow_page_pte() will be problematic > too, since FOLL_NUMA is added unless FOLL_FORCE is set. I suspect the > correct approach is to handle FOLL_GUEST as an exception to > pte_protnone(), though that might require adjusting pte_protnone() to be > meaningful even when CONFIG_NUMA_BALANCING=n. > > if ((flags & FOLL_NUMA) && pte_protnone(pte)) > goto no_page; > if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { > pte_unmap_unlock(ptep, ptl); > return NULL; > } Good catch. I'll look into how to untangle NUMA balancing and PageGuest(). It shouldn't be hard. PageGuest() pages should be subject to balancing. > > Do you see any problems with this? > > > Oh, and the other nicety is that I think it would avoid having to explicitly > > handle PageGuest() memory that is being accessed from kernel/KVM, i.e. if all > > memory exposed to KVM must be !PageGuest(), then it is also eligible for > > copy_{to,from}_user(). > > copy_{to,from}_user() enforce by setting PTE entries to PROT_NONE. > > But KVM does _not_ want those PTEs PROT_NONE. If KVM is accessing memory that > is also accessible by the the guest, then it must be shared. And if it's shared, > it must also be accessible to host userspace, i.e. something other than PROT_NONE, > otherwise the memory isn't actually shared with anything. > > As above, any guest-accessible memory that is accessed by the host must be > shared, and so must be mapped with the required permissions. I don't see a contradiction here: copy_{to,from}_user() would fail with -EFAULT on a PROT_NONE PTE. By saying in the initial posting that inserting PageGuest() into shared is fine, I didn't mean it's useful, just allowed.
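For illustration, piping the shared bit down could reduce to a flag-selection helper along these lines; the "private" parameter is hypothetical and none of this is code from the posted branch, where FOLL_GUEST is set whenever the VM has the feature enabled.

	/*
	 * Sketch only: choose gup flags based on whether the fault targets the
	 * private or the shared half of the GPA space.
	 */
	static unsigned int kvm_gup_flags(bool private, bool write_fault)
	{
		unsigned int flags = FOLL_HWPOISON;

		if (write_fault)
			flags |= FOLL_WRITE;

		/* Only accesses to private GPAs may reach PageGuest() memory. */
		if (private)
			flags |= FOLL_GUEST;

		return flags;
	}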
On Thu, Jun 03, 2021, Kirill A. Shutemov wrote: > On Wed, Jun 02, 2021 at 05:51:02PM +0000, Sean Christopherson wrote: > > > Omitting FOLL_GUEST for shared memory doesn't look like a right approach. > > > IIUC, it would require the kernel to track what memory is share and what > > > private, which defeat the purpose of the rework. I would rather enforce > > > !PageGuest() when share SEPT is populated in addition to enforcing > > > PageGuest() fro private SEPT. > > > > Isn't that what omitting FOLL_GUEST would accomplish? For shared memory, > > including mapping memory into the shared EPT, KVM will omit FOLL_GUEST and thus > > require the memory to be readable/writable according to the guest access type. > > Ah. I guess I see what you're saying: we can pipe down the shared bit from > GPA from direct_page_fault() (or whatever handles the fault) down to > hva_to_pfn_slow() and omit FOLL_GUEST if the shared bit is set. Right? Yep. > I guest it's doable, but codeshuffling going to be ugly. It shouldn't be too horrific. If it is horrific, I'd be more than happy to refactor the flow before hand to collect the hva_to_pfn() params into a struct so that adding a "private" flag is less painful. There is already TDX-related work to do similar cleanup in the x86-specific code. https://lkml.kernel.org/r/cover.1618914692.git.isaku.yamahata@intel.com > > By definition, that excludes PageGuest() because PageGuest() pages must always > > be unmapped, e.g. PROTNONE. And for private EPT, because PageGuest() is always > > PROTNONE or whatever, it will require FOLL_GUEST to retrieve the PTE/PMD/Pxx. > > > > On a semi-related topic, I don't think can_follow_write_pte() is the correct > > place to hook PageGuest(). TDX's S-EPT has a quirk where all private guest > > memory must be mapped writable, but that quirk doesn't hold true for non-TDX > > guests. It should be legal to map private guest memory as read-only. > > Hm. The point of the change in can_follow_write_pte() is to only allow to > write to a PageGuest() page if FOLL_GUEST is used and the mapping is > writable. Without the change gup(FOLL_GUEST|FOLL_WRITE) would fail. > > It doesn't prevent using read-only guest mappings as read-only. But if you > want to write to it it has to writable (in addtion to FOLL_GUEST). 100% agree that the page needs to be host-writable to be mapped as writable. What I was pointing out is that if FOLL_WRITE is not set, gup() will never check the PageGuest() exemption (moot point until the protnone check is fixed), and more importantly that the FOLL_GUEST check is orthogonal to the FOLL_WRITE check. In other words, I would expect the code to look something ike: if (PageGuest()) { if (!(flags & FOLL_GUEST)) { pte_unmap_unlock(ptep, ptl); return NULL; } } else if ((flags & FOLL_NUMA) && pte_protnone(pte)) { goto no_page; } > > And I believe the below snippet in follow_page_pte() will be problematic > > too, since FOLL_NUMA is added unless FOLL_FORCE is set. I suspect the > > correct approach is to handle FOLL_GUEST as an exception to > > pte_protnone(), though that might require adjusting pte_protnone() to be > > meaningful even when CONFIG_NUMA_BALANCING=n. > > > > if ((flags & FOLL_NUMA) && pte_protnone(pte)) > > goto no_page; > > if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { > > pte_unmap_unlock(ptep, ptl); > > return NULL; > > } > > Good catch. I'll look into how to untangle NUMA balancing and PageGuest(). > It shouldn't be hard. PageGuest() pages should be subject for balancing. 
> > > > Do you see any problems with this? > > > > > > > Oh, and the other nicety is that I think it would avoid having to explicitly > > > > handle PageGuest() memory that is being accessed from kernel/KVM, i.e. if all > > > > memory exposed to KVM must be !PageGuest(), then it is also eligible for > > > > copy_{to,from}_user(). > > > > > > copy_{to,from}_user() enforce by setting PTE entries to PROT_NONE. > > > > But KVM does _not_ want those PTEs PROT_NONE. If KVM is accessing memory that > > is also accessible by the the guest, then it must be shared. And if it's shared, > > it must also be accessible to host userspace, i.e. something other than PROT_NONE, > > otherwise the memory isn't actually shared with anything. > > > > As above, any guest-accessible memory that is accessed by the host must be > > shared, and so must be mapped with the required permissions. > > I don't see contradiction here: copy_{to,from}_user() would fail with > -EFAULT on PROT_NONE PTE. > > By saying in initial posting that inserting PageGuest() into shared is > fine, I didn't mean it's usefule, just allowed. Yeah, and I'm saying we should explicitly disallow mapping PageGuest() into shared memory, and then the KVM code that manually kmaps() PageGuest() memory to avoid copy_{to,from}_user() failure goes away.
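The "collect the hva_to_pfn() params into a struct" refactor mentioned above might look roughly like the sketch below; the struct name and fields are illustrative only, not from any posted series.

	/*
	 * Illustrative only: bundle the hva_to_pfn() arguments so that a new
	 * "private" flag is a one-field addition instead of another parameter
	 * threaded through every caller.
	 */
	struct kvm_hva_to_pfn_args {
		unsigned long	hva;
		bool		atomic;
		bool		*async;
		bool		write_fault;
		bool		*writable;
		bool		private;	/* hypothetical: access targets private GPA space */
	};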
On Thu, Jun 03, 2021 at 07:46:52PM +0000, Sean Christopherson wrote: > In other words, I would expect the code to look something ike: > > if (PageGuest()) { > if (!(flags & FOLL_GUEST)) { > pte_unmap_unlock(ptep, ptl); > return NULL; > } > } else if ((flags & FOLL_NUMA) && pte_protnone(pte)) { > goto no_page; > } Okay, looks good. Updated patch is below. I fixed few more bugs. The branch is also updated and rebased to v5.12. > Yeah, and I'm saying we should explicitly disallow mapping PageGuest() into > shared memory, and then the KVM code that manually kmaps() PageGuest() memory > to avoid copy_{to,from}_user() failure goes aways. Manual kmap thing is not needed for TDX: it only required for pure KVM where we handle instruction emulation in the host. ------------------------------8<------------------------------------------ From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Date: Fri, 16 Apr 2021 01:30:48 +0300 Subject: [PATCH] mm: Introduce guest-only pages PageGuest() pages are only allowed to be used as guest memory. Userspace is not allowed read from or write to such pages. On page fault, PageGuest() pages produce PROT_NONE page table entries. Read or write there will trigger SIGBUS. Access to such pages via syscall leads to -EIO. The new mprotect(2) flag PROT_GUEST translates to VM_GUEST. Any page fault to VM_GUEST VMA produces PageGuest() page. Only shared tmpfs/shmem mappings are supported. GUP normally fails on such pages. KVM will use the new FOLL_GUEST flag to access them. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- include/linux/mm.h | 5 ++- include/linux/mman.h | 7 +++- include/linux/page-flags.h | 8 ++++ include/uapi/asm-generic/mman-common.h | 1 + mm/gup.c | 36 ++++++++++++++---- mm/huge_memory.c | 51 +++++++++++++++++++------- mm/memory.c | 28 ++++++++++---- mm/mprotect.c | 18 ++++++++- mm/shmem.c | 12 ++++++ 9 files changed, 133 insertions(+), 33 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8ba434287387..8e679f4d0f21 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -362,6 +362,8 @@ extern unsigned int kobjsize(const void *objp); # define VM_GROWSUP VM_NONE #endif +#define VM_GUEST VM_HIGH_ARCH_4 + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) @@ -413,7 +415,7 @@ extern unsigned int kobjsize(const void *objp); #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif -#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) +#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_GUEST | VM_ARCH_CLEAR) /* * mapping from the currently active vm_flags protection bits (the @@ -2793,6 +2795,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_GUEST 0x100000 /* allow access to guest-only pages */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each diff --git a/include/linux/mman.h b/include/linux/mman.h index 629cefc4ecba..204e03d7787c 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -103,7 +103,9 @@ static inline void vm_unacct_memory(long pages) */ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) { - return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0; + int allowed; + allowed = 
PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM | PROT_GUEST; + return (prot & ~allowed) == 0; } #define arch_validate_prot arch_validate_prot #endif @@ -140,7 +142,8 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { return _calc_vm_trans(prot, PROT_READ, VM_READ ) | _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) | - _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) | + _calc_vm_trans(prot, PROT_EXEC, VM_EXEC ) | + _calc_vm_trans(prot, PROT_GUEST, VM_GUEST) | arch_calc_vm_prot_bits(prot, pkey); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 04a34c08e0a6..4bac0371f5c9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -436,6 +436,14 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif +#if defined(CONFIG_64BIT) && defined(CONFIG_HAVE_KVM_PROTECTED_MEMORY) +PAGEFLAG(Guest, arch_2, PF_HEAD) +TESTSCFLAG(Guest, arch_2, PF_HEAD) +#else +PAGEFLAG_FALSE(Guest) +TESTSCFLAG_FALSE(Guest) +#endif + #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f94f65d429be..c4d985d22b49 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -16,6 +16,7 @@ #define PROT_NONE 0x0 /* page can not be accessed */ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ +#define PROT_GUEST 0x04000000 /* KVM guest memory */ /* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ diff --git a/mm/gup.c b/mm/gup.c index ef7d2da9f03f..2d2d57f70e1f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -356,10 +356,22 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, * FOLL_FORCE can write to even unwritable pte's, but only * after we've gone through a COW cycle and they are dirty. 
*/ -static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) +static inline bool can_follow_write_pte(struct vm_area_struct *vma, + pte_t pte, unsigned int flags) { - return pte_write(pte) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); + if (pte_write(pte)) + return true; + + if ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)) + return true; + + if (!(flags & FOLL_GUEST)) + return false; + + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) != (VM_WRITE | VM_SHARED)) + return false; + + return PageGuest(pte_page(pte)); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -400,14 +412,20 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, migration_entry_wait(mm, pmd, address); goto retry; } - if ((flags & FOLL_NUMA) && pte_protnone(pte)) + + page = vm_normal_page(vma, address, pte); + if (page && PageGuest(page)) { + if (!(flags & FOLL_GUEST)) + goto no_page; + } else if ((flags & FOLL_NUMA) && pte_protnone(pte)) { goto no_page; - if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { + } + + if ((flags & FOLL_WRITE) && !can_follow_write_pte(vma, pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } - page = vm_normal_page(vma, address, pte); if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN @@ -571,8 +589,12 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); - if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) + if (PageGuest(pmd_page(pmdval))) { + if (!(flags & FOLL_GUEST)) + return no_page_table(vma, flags); + } else if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) { return no_page_table(vma, flags); + } retry_locked: ptl = pmd_lock(mm, pmd); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae907a9c2050..c430a52a3b7f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1336,10 +1336,22 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) * FOLL_FORCE can write to even unwritable pmd's, but only * after we've gone through a COW cycle and they are dirty. 
*/ -static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) +static inline bool can_follow_write_pmd(struct vm_area_struct *vma, + pmd_t pmd, unsigned int flags) { - return pmd_write(pmd) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); + if (pmd_write(pmd)) + return true; + + if ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)) + return true; + + if (!(flags & FOLL_GUEST)) + return false; + + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) != (VM_WRITE | VM_SHARED)) + return false; + + return PageGuest(pmd_page(pmd)); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, @@ -1352,20 +1364,30 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, assert_spin_locked(pmd_lockptr(mm, pmd)); - if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) - goto out; + if (!pmd_present(*pmd)) + return NULL; + + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + + if (PageGuest(page)) { + if (!(flags & FOLL_GUEST)) + return NULL; + } else if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) { + /* + * Full NUMA hinting faults to serialise migration in fault + * paths + */ + return NULL; + } + + if (flags & FOLL_WRITE && !can_follow_write_pmd(vma, *pmd, flags)) + return NULL; /* Avoid dumping huge zero page */ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) return ERR_PTR(-EFAULT); - /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) - goto out; - - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); - if (!try_grab_page(page, flags)) return ERR_PTR(-ENOMEM); @@ -1408,7 +1430,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); -out: return page; } @@ -1426,6 +1447,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) bool was_writable; int flags = 0; + page = pmd_page(pmd); + if (PageGuest(page)) + return VM_FAULT_SIGBUS; + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(pmd, *vmf->pmd))) goto out_unlock; diff --git a/mm/memory.c b/mm/memory.c index 550405fc3b5e..d588220feabf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3703,9 +3703,13 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) for (i = 0; i < HPAGE_PMD_NR; i++) flush_icache_page(vma, page + i); - entry = mk_huge_pmd(page, vma->vm_page_prot); - if (write) - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (PageGuest(page)) { + entry = mk_huge_pmd(page, PAGE_NONE); + } else { + entry = mk_huge_pmd(page, vma->vm_page_prot); + if (write) + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + } add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); page_add_file_rmap(page, true); @@ -3741,13 +3745,17 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) pte_t entry; flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); + if (PageGuest(page)) { + entry = mk_pte(page, PAGE_NONE); + } else { + entry = mk_pte(page, vma->vm_page_prot); - if (prefault && arch_wants_old_prefaulted_pte()) - entry = pte_mkold(entry); + if (prefault && arch_wants_old_prefaulted_pte()) + entry = pte_mkold(entry); - if (write) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + } /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) 
{ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); @@ -4105,6 +4113,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; + page = pte_page(vmf->orig_pte); + if (PageGuest(page)) + return VM_FAULT_SIGBUS; + /* * The "pte" at this point cannot be used safely without * validation through pte_unmap_same(). It's of NUMA type but diff --git a/mm/mprotect.c b/mm/mprotect.c index 94188df1ee55..aecba46af544 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -484,8 +484,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); vma_set_page_prot(vma); - change_protection(vma, start, end, vma->vm_page_prot, - dirty_accountable ? MM_CP_DIRTY_ACCT : 0); + if (vma->vm_flags & VM_GUEST) { + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); + } else { + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable ? MM_CP_DIRTY_ACCT : 0); + } /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major @@ -603,6 +607,16 @@ static int do_mprotect_pkey(unsigned long start, size_t len, goto out; } + if ((newflags & (VM_GUEST|VM_SHARED)) == VM_GUEST) { + error = -EINVAL; + goto out; + } + + if ((newflags & VM_GUEST) && !vma_is_shmem(vma)) { + error = -EINVAL; + goto out; + } + /* Allow architectures to sanity-check the new flags */ if (!arch_validate_flags(newflags)) { error = -EINVAL; diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..0f44f2fac06c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1835,6 +1835,11 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (page && sgp == SGP_WRITE) mark_page_accessed(page); + if (page && PageGuest(page) && sgp != SGP_CACHE) { + error = -EIO; + goto unlock; + } + /* fallocated page? */ if (page && !PageUptodate(page)) { if (sgp != SGP_READ) @@ -2117,6 +2122,13 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) gfp, vma, vmf, &ret); if (err) return vmf_error(err); + + if ((vmf->vma->vm_flags & VM_GUEST) && !TestSetPageGuest(vmf->page)) { + struct page *head = compound_head(vmf->page); + try_to_unmap(head, TTU_IGNORE_MLOCK); + set_page_dirty(head); + } + return ret; }
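As a usage illustration (this is not the attached QEMU patch, just a sketch of what userspace might do with the semantics above, assuming the PROT_GUEST value from the patch): back guest RAM with a shared shmem mapping and mark it guest-only, so that subsequent faults produce PageGuest() pages.

	#include <stddef.h>
	#include <sys/mman.h>

	#ifndef PROT_GUEST
	#define PROT_GUEST 0x04000000	/* value from the patch above */
	#endif

	/*
	 * Sketch only: MAP_ANONYMOUS|MAP_SHARED is shmem-backed, which the
	 * patch accepts; error reporting is trimmed.
	 */
	static void *alloc_guest_only_ram(size_t size)
	{
		void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_ANONYMOUS, -1, 0);

		if (ram == MAP_FAILED)
			return NULL;

		/* Later faults in this range produce PageGuest() pages. */
		if (mprotect(ram, size, PROT_READ | PROT_WRITE | PROT_GUEST)) {
			munmap(ram, size);
			return NULL;
		}

		return ram;
	}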
On 5/21/21 5:31 AM, Kirill A. Shutemov wrote: > Hi Sean, > > The core patch of the approach we've discussed before is below. It > introduce a new page type with the required semantics. > > The full patchset can be found here: > > git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git kvm-unmapped-guest-only > > but only the patch below is relevant for TDX. QEMU patch is attached. > > CONFIG_HAVE_KVM_PROTECTED_MEMORY has to be changed to what is appropriate > for TDX and FOLL_GUEST has to be used in hva_to_pfn_slow() when running > TDX guest. > > When page get inserted into private sept we must make sure it is > PageGuest() or SIGBUS otherwise. Inserting PageGuest() into shared is > fine, but the page will not be accessible from userspace. I may have missed a detail, and I think Sean almost said this, but: I don't think that a single bit like this is sufficient. A KVM host can contain more than one TDX guest, and, to reliably avoid machine checks, I think we need to make sure that secure pages are only ever mapped into the guest to which they belong, as well as to the right GPA. If a KVM host userspace process creates a PageGuest page, what stops it from being mapped into two different guests? The refcount? After contemplating this a bit, I have a somewhat different suggestion, at least for TDX. In TDX, a guest logically has two entirely separate physical address spaces: the secure address space and the shared address space. They are managed separately, they have different contents, etc. Normally one would expect a given numerical address (with the secure selector bit masked off) to only exist in one of the two spaces, but nothing in the architecture fundamentally requires this. And switching a page between the secure and shared states is a heavyweight operation. So what if KVM actually treated them completely separately? The secure address space doesn't have any of the complex properties that the shared address space has. There are no paravirt pages there, no KSM pages there, no forked pages there, no MAP_ANONYMOUS pages there, no MMIO pages there, etc. There could be a KVM API to allocate and deallocate a secure page, and that would be it. This could be inefficient if a guest does a bunch of MapGPA calls and makes the shared address space highly sparse. That could potentially be mitigated by allowing a (shared) user memory region to come with a bitmap of which pages are actually mapped. Pages in the unmapped areas are holes, and KVM won't look through to the QEMU page tables. Does this make any sense? It would completely eliminate all this PageGuest stuff -- a secure page would be backed by a *kernel* allocation (potentially a pageable allocation a la tmpfs if TDX ever learns how to swap, but that's not currently possible, nor would it be user visible), so the kernel can straightforwardly guarantee that user pagetables will not contain references to secure pages and that GUP will not be called on them. I'm not sure how this would map to SEV. Certainly, with some degree of host vs guest cooperation, SEV could work the same way, and the migration support is pushing SEV in that direction.
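To give the "allocate and deallocate a secure page" API a concrete shape, it might look vaguely like the following; this is purely hypothetical, and nothing like it exists in the posted series or in KVM's uAPI.

	#include <linux/kvm.h>
	#include <linux/types.h>

	/* Purely hypothetical uAPI sketch for a separate secure address space. */
	struct kvm_secure_mem_op {
		__u64 gpa;	/* address in the guest's secure physical address space */
		__u64 size;	/* bytes, page-aligned */
	};

	#define KVM_SECURE_MEM_ALLOC	_IOW(KVMIO, 0xd0, struct kvm_secure_mem_op)
	#define KVM_SECURE_MEM_FREE	_IOW(KVMIO, 0xd1, struct kvm_secure_mem_op)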
On Fri, Jun 04, 2021 at 10:16:49AM -0700, Andy Lutomirski wrote: > On 5/21/21 5:31 AM, Kirill A. Shutemov wrote: > > Hi Sean, > > > > The core patch of the approach we've discussed before is below. It > > introduce a new page type with the required semantics. > > > > The full patchset can be found here: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git kvm-unmapped-guest-only > > > > but only the patch below is relevant for TDX. QEMU patch is attached. > > > > CONFIG_HAVE_KVM_PROTECTED_MEMORY has to be changed to what is appropriate > > for TDX and FOLL_GUEST has to be used in hva_to_pfn_slow() when running > > TDX guest. > > > > When page get inserted into private sept we must make sure it is > > PageGuest() or SIGBUS otherwise. Inserting PageGuest() into shared is > > fine, but the page will not be accessible from userspace. > > I may have missed a detail, and I think Sean almost said this, but: > > I don't think that a single bit like this is sufficient. A KVM host can > contain more than one TDX guest, and, to reliably avoid machine checks, > I think we need to make sure that secure pages are only ever mapped into > the guest to which they belong, as well as to the right GPA. If a KVM > user host process creates a PageGuest page, what stops it from being > mapped into two different guests? The refcount? The TDX module ensures that a pfn can only be used once as private memory. > After contemplating this a bit, I have a somewhat different suggestion, > at least for TDX. In TDX, a guest logically has two entirely separate > physical address spaces: the secure address space and the shared address > space. They are managed separately, they have different contents, etc. > Normally one would expect a given numerical address (with the secure > selector bit masked off) to only exist in one of the two spaces, but > nothing in the architecture fundamentally requires this. And switching > a page between the secure and shared states is a heavyweight operation. > > So what if KVM actually treated them completely separately? The secure > address space doesn't have any of the complex properties that the shared > address space has. There are no paravirt pages there, no KSM pages > there, no forked pages there, no MAP_ANONYMOUS pages there, no MMIO > pages there, etc. There could be a KVM API to allocate and deallocate a > secure page, and that would be it. > > This could be inefficient if a guest does a bunch of MapGPA calls and > makes the shared address space highly sparse. That could potentially be > mitigated by allowing a (shared) user memory region to come with a > bitmap of which pages are actually mapped. Pages in the unmapped areas > are holes, and KVM won't look through to the QEMU page tables. > > Does this make any sense? It would completely eliminate all this > PageGuest stuff -- a secure page would be backed by a *kernel* > allocation (potentially a pageable allocation a la tmpfs if TDX ever > learns how to swap, but that's no currently possible nor would it be > user visible), so the kernel can straightforwardly guarantee that user > pagetables will not contain references to secure pages and that GUP will > not be called on them. It doesn't fit the KVM design AFAICS: userspace owns the guest memory mapping. And it's not future-proof: private memory has to be subject to Linux memory management in the future. For instance, it should be possible to migrate such memory. Having the memory hidden within the kernel makes that difficult: the existing API is not suitable to handle it.
> I'm not sure how this would map to SEV. Certainly, with some degree of > host vs guest cooperation, SEV could work the same way, and the > migration support is pushing SEV in that direction.
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 7ac592664c52..b7db1c455e7c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -46,6 +46,7 @@ config KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU + select HAVE_KVM_PROTECTED_MEMORY help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 38172ca627d3..1457692c1080 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -796,7 +796,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) (1 << KVM_FEATURE_PV_SEND_IPI) | (1 << KVM_FEATURE_POLL_CONTROL) | (1 << KVM_FEATURE_PV_SCHED_YIELD) | - (1 << KVM_FEATURE_ASYNC_PF_INT); + (1 << KVM_FEATURE_ASYNC_PF_INT) | + (1 << KVM_FEATURE_MEM_PROTECTED); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3b97342af026..3fa76693edcd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -43,6 +43,7 @@ #include <linux/hash.h> #include <linux/kern_levels.h> #include <linux/kthread.h> +#include <linux/rmap.h> #include <asm/page.h> #include <asm/memtype.h> @@ -2758,7 +2759,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) if (sp->role.level > PG_LEVEL_4K) return; - __direct_pte_prefetch(vcpu, sp, sptep); + if (!vcpu->kvm->mem_protected) + __direct_pte_prefetch(vcpu, sp, sptep); } static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -3725,6 +3727,14 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; + if (vcpu->kvm->mem_protected && unlikely(!is_noslot_pfn(pfn))) { + if (!kvm_protect_pfn(vcpu->kvm, gfn, pfn)) { + unsigned long hva = kvm_vcpu_gfn_to_hva(vcpu, gfn); + kvm_send_hwpoison_signal(hva, current); + return RET_PF_RETRY; + } + } + r = RET_PF_RETRY; spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 50e268eb8e1a..26b0494a1207 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -397,8 +397,14 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, goto error; ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(__get_user(pte, ptep_user))) - goto error; + if (vcpu->kvm->mem_protected) { + if (copy_from_guest(vcpu->kvm, &pte, host_addr + offset, + sizeof(pte))) + goto error; + } else { + if (unlikely(__get_user(pte, ptep_user))) + goto error; + } walker->ptep_user[walker->level - 1] = ptep_user; trace_kvm_mmu_paging_element(pte, walker->level); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b404e4d7dd8..f8183386abe7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8170,6 +8170,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_sched_yield(vcpu->kvm, a0); ret = 0; break; + case KVM_HC_ENABLE_MEM_PROTECTED: + ret = kvm_protect_memory(vcpu->kvm); + break; + case KVM_HC_MEM_SHARE: + ret = kvm_share_memory(vcpu->kvm, a0, a1); + break; default: ret = -KVM_ENOSYS; break; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fadaccb95a4c..cd2374802702 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -436,6 +436,8 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) } #endif +#define 
KVM_NR_SHARED_RANGES 32 + /* * Note: * memslots are not sorted by id anymore, please use id_to_memslot() @@ -513,6 +515,10 @@ struct kvm { pid_t userspace_pid; unsigned int max_halt_poll_ns; u32 dirty_ring_size; + bool mem_protected; + void *id; + int nr_shared_ranges; + struct range shared_ranges[KVM_NR_SHARED_RANGES]; }; #define kvm_err(fmt, ...) \ @@ -709,6 +715,16 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm); void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); +int kvm_protect_memory(struct kvm *kvm); + +void __kvm_share_memory(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_share_memory(struct kvm *kvm, unsigned long gfn, unsigned long npages); + +bool kvm_protect_pfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn); +void kvm_share_pfn(struct kvm *kvm, kvm_pfn_t pfn); + +bool kvm_page_allowed(struct kvm *kvm, struct page *page); + int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages); @@ -718,6 +734,9 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable); +int copy_from_guest(struct kvm *kvm, void *data, unsigned long hva, int len); +int copy_to_guest(struct kvm *kvm, unsigned long hva, const void *data, int len); + void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index 09d36683ee0a..743e621111f0 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -17,6 +17,7 @@ #define KVM_E2BIG E2BIG #define KVM_EPERM EPERM #define KVM_EOPNOTSUPP 95 +#define KVM_EINTR EINTR #define KVM_HC_VAPIC_POLL_IRQ 1 #define KVM_HC_MMU_OP 2 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 519a60d5b6f7..5d05129e8b0c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1224,6 +1224,13 @@ static __always_inline bool free_pages_prepare(struct page *page, trace_mm_page_free(page, order); + if (IS_ENABLED(CONFIG_HAVE_KVM_PROTECTED_MEMORY) && + unlikely(PageHWPoison(page))) { + void kvm_unpoison_page(struct page *page); + + kvm_unpoison_page(page); + } + if (unlikely(PageHWPoison(page)) && !order) { /* * Do not let hwpoison pages hit pcplists/buddy diff --git a/virt/Makefile b/virt/Makefile index 1cfea9436af9..a931049086a3 100644 --- a/virt/Makefile +++ b/virt/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += lib/ +obj-y += lib/ kvm/ diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 1c37ccd5d402..5f28048718f4 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -63,3 +63,7 @@ config HAVE_KVM_NO_POLL config KVM_XFER_TO_GUEST_WORK bool + +config HAVE_KVM_PROTECTED_MEMORY + bool + select MEMORY_FAILURE diff --git a/virt/kvm/Makefile b/virt/kvm/Makefile new file mode 100644 index 000000000000..f10c62b98968 --- /dev/null +++ b/virt/kvm/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_HAVE_KVM_PROTECTED_MEMORY) += mem_protected.o diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3e2dbaef7979..9cbd3716b3e1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -51,6 +51,7 @@ #include <linux/io.h> #include <linux/lockdep.h> #include <linux/kthread.h> +#include <linux/rmap.h> #include <asm/processor.h> #include <asm/ioctl.h> @@ -741,6 +742,7 @@ static struct kvm *kvm_create_vm(unsigned long type) 
struct kvm *kvm = kvm_arch_alloc_vm(); int r = -ENOMEM; int i; + static long id = 0; if (!kvm) return ERR_PTR(-ENOMEM); @@ -803,6 +805,9 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); + kvm->id = xa_mk_value(id++); + if (id < 0) + id = 0; mutex_unlock(&kvm_lock); preempt_notifier_inc(); @@ -1852,7 +1857,8 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, +static int hva_to_pfn_slow(struct kvm *kvm, unsigned long addr, + bool *async, bool write_fault, bool *writable, kvm_pfn_t *pfn) { unsigned int flags = FOLL_HWPOISON; @@ -1868,11 +1874,17 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, flags |= FOLL_WRITE; if (async) flags |= FOLL_NOWAIT; + if (kvm->mem_protected) + flags |= FOLL_ALLOW_POISONED; npages = get_user_pages_unlocked(addr, 1, &page, flags); if (npages != 1) return npages; + if (IS_ENABLED(CONFIG_HAVE_KVM_PROTECTED_MEMORY) && + kvm->mem_protected && !kvm_page_allowed(kvm, page)) + return -EHWPOISON; + /* map read fault as writable if possible */ if (unlikely(!write_fault) && writable) { struct page *wpage; @@ -1961,8 +1973,9 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * 2): @write_fault = false && @writable, @writable will tell the caller * whether the mapping is writable. */ -static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, - bool write_fault, bool *writable) +static kvm_pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, + bool atomic, bool *async, + bool write_fault, bool *writable) { struct vm_area_struct *vma; kvm_pfn_t pfn = 0; @@ -1977,7 +1990,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, if (atomic) return KVM_PFN_ERR_FAULT; - npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); + npages = hva_to_pfn_slow(kvm, addr, async, write_fault, writable, &pfn); if (npages == 1) return pfn; @@ -2033,8 +2046,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, writable = NULL; } - return hva_to_pfn(addr, atomic, async, write_fault, - writable); + return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); } EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); @@ -2338,19 +2350,93 @@ static int next_segment(unsigned long len, int offset) return len; } -static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, - void *data, int offset, int len) +int copy_from_guest(struct kvm *kvm, void *data, unsigned long hva, int len) +{ + int offset = offset_in_page(hva); + struct page *page; + int npages, seg; + void *vaddr; + + if (!IS_ENABLED(CONFIG_HAVE_KVM_PROTECTED_MEMORY) || + !kvm->mem_protected) { + return __copy_from_user(data, (void __user *)hva, len); + } + + might_fault(); + kasan_check_write(data, len); + check_object_size(data, len, false); + + while ((seg = next_segment(len, offset)) != 0) { + npages = get_user_pages_unlocked(hva, 1, &page, + FOLL_ALLOW_POISONED); + if (npages != 1) + return -EFAULT; + + if (!kvm_page_allowed(kvm, page)) + return -EFAULT; + + vaddr = kmap_atomic(page); + memcpy(data, vaddr + offset, seg); + kunmap_atomic(vaddr); + + put_page(page); + len -= seg; + hva += seg; + data += seg; + offset = 0; + } + + return 0; +} + +int copy_to_guest(struct kvm *kvm, unsigned long hva, const void *data, int 
len) +{ + int offset = offset_in_page(hva); + struct page *page; + int npages, seg; + void *vaddr; + + if (!IS_ENABLED(CONFIG_HAVE_KVM_PROTECTED_MEMORY) || + !kvm->mem_protected) { + return __copy_to_user((void __user *)hva, data, len); + } + + might_fault(); + kasan_check_read(data, len); + check_object_size(data, len, true); + + while ((seg = next_segment(len, offset)) != 0) { + npages = get_user_pages_unlocked(hva, 1, &page, + FOLL_WRITE | FOLL_ALLOW_POISONED); + if (npages != 1) + return -EFAULT; + + if (!kvm_page_allowed(kvm, page)) + return -EFAULT; + + vaddr = kmap_atomic(page); + memcpy(vaddr + offset, data, seg); + kunmap_atomic(vaddr); + + put_page(page); + len -= seg; + hva += seg; + data += seg; + offset = 0; + } + + return 0; +} + +static int __kvm_read_guest_page(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, void *data, int offset, int len) { - int r; unsigned long addr; addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; - r = __copy_from_user(data, (void __user *)addr + offset, len); - if (r) - return -EFAULT; - return 0; + return copy_from_guest(kvm, data, addr + offset, len); } int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, @@ -2358,7 +2444,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - return __kvm_read_guest_page(slot, gfn, data, offset, len); + return __kvm_read_guest_page(kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_page); @@ -2367,7 +2453,7 @@ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __kvm_read_guest_page(slot, gfn, data, offset, len); + return __kvm_read_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); @@ -2449,7 +2535,8 @@ static int __kvm_write_guest_page(struct kvm *kvm, addr = gfn_to_hva_memslot(memslot, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; - r = __copy_to_user((void __user *)addr + offset, data, len); + + r = copy_to_guest(kvm, addr + offset, data, len); if (r) return -EFAULT; mark_page_dirty_in_slot(kvm, memslot, gfn); @@ -2586,7 +2673,7 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, if (unlikely(!ghc->memslot)) return kvm_write_guest(kvm, gpa, data, len); - r = __copy_to_user((void __user *)ghc->hva + offset, data, len); + r = copy_to_guest(kvm, ghc->hva + offset, data, len); if (r) return -EFAULT; mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT); @@ -2607,7 +2694,6 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); - int r; gpa_t gpa = ghc->gpa + offset; BUG_ON(len + offset > ghc->len); @@ -2623,11 +2709,7 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, if (unlikely(!ghc->memslot)) return kvm_read_guest(kvm, gpa, data, len); - r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); - if (r) - return -EFAULT; - - return 0; + return copy_from_guest(kvm, data, ghc->hva + offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); @@ -2693,6 +2775,41 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); +int kvm_protect_memory(struct kvm *kvm) +{ + if (mmap_write_lock_killable(kvm->mm)) + return -KVM_EINTR; + + kvm->mem_protected = 
true; + kvm_arch_flush_shadow_all(kvm); + mmap_write_unlock(kvm->mm); + + return 0; +} + +int kvm_share_memory(struct kvm *kvm, unsigned long gfn, unsigned long npages) +{ + unsigned long end = gfn + npages; + + if (!npages || !IS_ENABLED(CONFIG_HAVE_KVM_PROTECTED_MEMORY)) + return 0; + + __kvm_share_memory(kvm, gfn, end); + + for (; gfn < end; gfn++) { + struct page *page = gfn_to_page(kvm, gfn); + unsigned long pfn = page_to_pfn(page); + + if (page == KVM_ERR_PTR_BAD_PAGE) + continue; + + kvm_share_pfn(kvm, pfn); + put_page(page); + } + + return 0; +} + void kvm_sigset_activate(struct kvm_vcpu *vcpu) { if (!vcpu->sigset_active) diff --git a/virt/kvm/mem_protected.c b/virt/kvm/mem_protected.c new file mode 100644 index 000000000000..81882bd3232b --- /dev/null +++ b/virt/kvm/mem_protected.c @@ -0,0 +1,110 @@ +#include <linux/kvm_host.h> +#include <linux/mm.h> +#include <linux/rmap.h> + +static DEFINE_XARRAY(kvm_pfn_map); + +static bool gfn_is_shared(struct kvm *kvm, unsigned long gfn) +{ + bool ret = false; + int i; + + spin_lock(&kvm->mmu_lock); + for (i = 0; i < kvm->nr_shared_ranges; i++) { + if (gfn < kvm->shared_ranges[i].start) + continue; + if (gfn >= kvm->shared_ranges[i].end) + continue; + + ret = true; + break; + } + spin_unlock(&kvm->mmu_lock); + + return ret; +} + +bool kvm_protect_pfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn) +{ + struct page *page = pfn_to_page(pfn); + bool ret = true; + + if (gfn_is_shared(kvm, gfn)) + return true; + + if (is_zero_pfn(pfn)) + return true; + + /* Only anonymous and shmem/tmpfs pages are supported */ + if (!PageSwapBacked(page)) + return false; + + lock_page(page); + + /* Recheck gfn_is_shared() under page lock */ + if (gfn_is_shared(kvm, gfn)) + goto out; + + if (!TestSetPageHWPoison(page)) { + try_to_unmap(page, TTU_IGNORE_MLOCK); + xa_store(&kvm_pfn_map, pfn, kvm->id, GFP_KERNEL); + } else if (xa_load(&kvm_pfn_map, pfn) != kvm->id) { + ret = false; + } +out: + unlock_page(page); + return ret; +} +EXPORT_SYMBOL_GPL(kvm_protect_pfn); + +void __kvm_share_memory(struct kvm *kvm, + unsigned long start, unsigned long end) +{ + /* + * Out of slots. + * Still worth to proceed: the new range may merge with an existing + * one. + */ + WARN_ON_ONCE(kvm->nr_shared_ranges == ARRAY_SIZE(kvm->shared_ranges)); + + spin_lock(&kvm->mmu_lock); + kvm->nr_shared_ranges = add_range_with_merge(kvm->shared_ranges, + ARRAY_SIZE(kvm->shared_ranges), + kvm->nr_shared_ranges, + start, end); + kvm->nr_shared_ranges = clean_sort_range(kvm->shared_ranges, + ARRAY_SIZE(kvm->shared_ranges)); + spin_unlock(&kvm->mmu_lock); +} +EXPORT_SYMBOL(__kvm_share_memory); + +void kvm_share_pfn(struct kvm *kvm, kvm_pfn_t pfn) +{ + struct page *page = pfn_to_page(pfn); + + lock_page(page); + if (xa_load(&kvm_pfn_map, pfn) == kvm->id) { + xa_erase(&kvm_pfn_map, pfn); + ClearPageHWPoison(page); + } + unlock_page(page); +} +EXPORT_SYMBOL(kvm_share_pfn); + +void kvm_unpoison_page(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + + if (xa_erase(&kvm_pfn_map, pfn)) + ClearPageHWPoison(page); +} + +bool kvm_page_allowed(struct kvm *kvm, struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + + if (!PageHWPoison(page)) + return true; + + return xa_load(&kvm_pfn_map, pfn) == kvm->id; +}
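For reference, guest-side usage of the two hypercalls added by the patch above might look like this sketch; it assumes KVM_HC_ENABLE_MEM_PROTECTED and KVM_HC_MEM_SHARE get numbers in include/uapi/linux/kvm_para.h, which the diff above does not show.

	#include <asm/kvm_para.h>

	/*
	 * Sketch only: hypercall numbers are assumed to be defined in
	 * include/uapi/linux/kvm_para.h; only their handling in x86.c is
	 * shown in the diff above.
	 */
	static long kvm_enable_mem_protection(void)
	{
		return kvm_hypercall0(KVM_HC_ENABLE_MEM_PROTECTED);
	}

	static long kvm_share_range_with_host(unsigned long gfn, unsigned long npages)
	{
		return kvm_hypercall2(KVM_HC_MEM_SHARE, gfn, npages);
	}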