Message ID | 97bb1f2996d8a7b828cd9e3309380d1a86ca681b.1705965635.git.isaku.yamahata@intel.com
---|---
State | New, archived
Series | KVM TDX basic feature support
On Mon, Jan 22, 2024, isaku.yamahata@intel.com wrote: > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > index 4cbcedff4f16..1a5a91b99de9 100644 > --- a/arch/x86/kvm/vmx/tdx.c > +++ b/arch/x86/kvm/vmx/tdx.c > @@ -591,6 +591,69 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, > return 0; > } > > +static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, > + enum pg_level level, kvm_pfn_t pfn) > +{ > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > + hpa_t hpa = pfn_to_hpa(pfn); > + gpa_t gpa = gfn_to_gpa(gfn); > + struct tdx_module_args out; > + hpa_t source_pa; > + bool measure; > + u64 err; > + int i; > + > + /* > + * KVM_INIT_MEM_REGION, tdx_init_mem_region(), supports only 4K page > + * because tdh_mem_page_add() supports only 4K page. > + */ > + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) > + return -EINVAL; > + > + /* > + * In case of TDP MMU, fault handler can run concurrently. Note > + * 'source_pa' is a TD scope variable, meaning if there are multiple > + * threads reaching here with all needing to access 'source_pa', it > + * will break. However fortunately this won't happen, because below > + * TDH_MEM_PAGE_ADD code path is only used when VM is being created > + * before it is running, using KVM_TDX_INIT_MEM_REGION ioctl (which > + * always uses vcpu 0's page table and protected by vcpu->mutex). > + */ Most of the above is superflous. tdx_mem_page_add() is called if and only if the TD is finalized, and the TDX module disallow running vCPUs before the TD is finalized. That's it. And maybe throw in a lockdep to assert that kvm->lock is held. > + if (KVM_BUG_ON(kvm_tdx->source_pa == INVALID_PAGE, kvm)) { > + tdx_unpin(kvm, pfn); > + return -EINVAL; > + } > + > + source_pa = kvm_tdx->source_pa & ~KVM_TDX_MEASURE_MEMORY_REGION; > + measure = kvm_tdx->source_pa & KVM_TDX_MEASURE_MEMORY_REGION; > + kvm_tdx->source_pa = INVALID_PAGE; > + > + do { > + err = tdh_mem_page_add(kvm_tdx->tdr_pa, gpa, hpa, source_pa, > + &out); > + /* > + * This path is executed during populating initial guest memory > + * image. i.e. before running any vcpu. Race is rare. How are races possible at all? > + */ > + } while (unlikely(err == TDX_ERROR_SEPT_BUSY)); > + if (KVM_BUG_ON(err, kvm)) { > + pr_tdx_error(TDH_MEM_PAGE_ADD, err, &out); > + tdx_unpin(kvm, pfn); > + return -EIO; > + } else if (measure) { > + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { > + err = tdh_mr_extend(kvm_tdx->tdr_pa, gpa + i, &out); > + if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { > + pr_tdx_error(TDH_MR_EXTEND, err, &out); > + break; > + } > + } Why is measurement done deep within the MMU? At a glance, I don't see why this can't be done up in the ioctl, outside of a spinlock. And IIRC, the order affects the measurement but doesn't truly matter, e.g. KVM could choose to completely separate tdh_mr_extend() from tdh_mem_page_add(), no? > +static int tdx_init_mem_region(struct kvm *kvm, struct kvm_tdx_cmd *cmd) > +{ > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > + struct kvm_tdx_init_mem_region region; > + struct kvm_vcpu *vcpu; > + struct page *page; > + int idx, ret = 0; > + bool added = false; > + > + /* Once TD is finalized, the initial guest memory is fixed. */ > + if (is_td_finalized(kvm_tdx)) > + return -EINVAL; > + > + /* The BSP vCPU must be created before initializing memory regions. 
*/ > + if (!atomic_read(&kvm->online_vcpus)) > + return -EINVAL; > + > + if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) > + return -EINVAL; > + > + if (copy_from_user(®ion, (void __user *)cmd->data, sizeof(region))) > + return -EFAULT; > + > + /* Sanity check */ > + if (!IS_ALIGNED(region.source_addr, PAGE_SIZE) || > + !IS_ALIGNED(region.gpa, PAGE_SIZE) || > + !region.nr_pages || > + region.nr_pages & GENMASK_ULL(63, 63 - PAGE_SHIFT) || > + region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || > + !kvm_is_private_gpa(kvm, region.gpa) || > + !kvm_is_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT))) > + return -EINVAL; > + > + vcpu = kvm_get_vcpu(kvm, 0); > + if (mutex_lock_killable(&vcpu->mutex)) > + return -EINTR; The real reason for this drive-by pseudo-review is that I am hoping/wishing we can turn this into a generic KVM ioctl() to allow userspace to pre-map guest memory[*]. If we're going to carry non-trivial code, we might as well squeeze as much use out of it as we can. Beyond wanting to shove this into KVM_MEMORY_ENCRYPT_OP, is there any reason why this is a VM ioctl() and not a vCPU ioctl()? Very roughly, couldn't we use a struct like this as input to a vCPU ioctl() that maps memory, and optionally initializes memory from @source? struct kvm_memory_mapping { __u64 base_gfn; __u64 nr_pages; __u64 flags; __u64 source; } TDX would need to do special things for copying the source, but beyond that most of the code in this function is generic. [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it
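As a concrete reading of the suggestion above, the comment and check could collapse to something like the sketch below. It reuses the names from the patch (to_kvm_tdx(), tdx_unpin(), source_pa, KVM_BUG_ON()); the lockdep assertion assumes kvm->lock really is held on the KVM_TDX_INIT_MEM_REGION path, which is implied above but not shown in this hunk:

	static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn,
				    enum pg_level level, kvm_pfn_t pfn)
	{
		struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
		...
		/*
		 * tdx_mem_page_add() is called iff the TD is not finalized, and
		 * the TDX module disallows running vCPUs before finalization, so
		 * KVM_TDX_INIT_MEM_REGION is the only path that can reach this
		 * point and 'source_pa' cannot be accessed concurrently.
		 */
		lockdep_assert_held(&kvm->lock);

		if (KVM_BUG_ON(kvm_tdx->source_pa == INVALID_PAGE, kvm)) {
			tdx_unpin(kvm, pfn);
			return -EINVAL;
		}
		...
	}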
+Vipin Sharma On Wed, Jan 31, 2024 at 4:21 PM Sean Christopherson <seanjc@google.com> wrote: > On Mon, Jan 22, 2024, isaku.yamahata@intel.com wrote: > > The real reason for this drive-by pseudo-review is that I am hoping/wishing we > can turn this into a generic KVM ioctl() to allow userspace to pre-map guest > memory[*]. > > If we're going to carry non-trivial code, we might as well squeeze as much use > out of it as we can. > > Beyond wanting to shove this into KVM_MEMORY_ENCRYPT_OP, is there any reason why > this is a VM ioctl() and not a vCPU ioctl()? Very roughly, couldn't we use a > struct like this as input to a vCPU ioctl() that maps memory, and optionally > initializes memory from @source? > > struct kvm_memory_mapping { > __u64 base_gfn; > __u64 nr_pages; > __u64 flags; > __u64 source; > } > > TDX would need to do special things for copying the source, but beyond that most > of the code in this function is generic. > > [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it We would also be interested in such an API to reduce the guest performance impact of intra-host migration.
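To make "most of the code in this function is generic" concrete: stripped of the TDX-specific source_pa handling, the population loop in tdx_init_mem_region() boils down to roughly the sketch below, which is the part a generic pre-mapping vCPU ioctl could share (and what would run after intra-host migration to pre-fault guest memory). The function name kvm_vcpu_map_memory() is hypothetical; the sketch assumes the kvm_mmu_map_tdp_page() helper introduced by this series stays available, and the exact fault error-code bits are left as policy:

	/* Hypothetical generic core, distilled from tdx_init_mem_region(). */
	static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
				       struct kvm_memory_mapping *map)
	{
		u64 error_code = PFERR_WRITE_MASK;	/* which bits to fault with is TBD */
		int ret = 0;

		kvm_mmu_reload(vcpu);

		while (map->nr_pages) {
			if (signal_pending(current))
				return -ERESTARTSYS;
			cond_resched();

			/* Vendor code (e.g. TDX copying from map->source) would hook in here. */
			ret = kvm_mmu_map_tdp_page(vcpu, gfn_to_gpa(map->base_gfn),
						   error_code, PG_LEVEL_4K);
			if (ret)
				break;

			map->base_gfn++;
			map->nr_pages--;
		}

		return ret;
	}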
On 1/23/2024 7:53 AM, isaku.yamahata@intel.com wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > > Because the guest memory is protected in TDX, the creation of the initial > guest memory requires a dedicated TDX module API, tdh_mem_page_add, instead > of directly copying the memory contents into the guest memory in the case > of the default VM type. KVM MMU page fault handler callback, > private_page_add, handles it. The changelog is stale? Do you mean "set_private_spte"? > > Define new subcommand, KVM_TDX_INIT_MEM_REGION, of VM-scoped > KVM_MEMORY_ENCRYPT_OP. It assigns the guest page, copies the initial > memory contents into the guest memory, encrypts the guest memory. At the > same time, optionally it extends memory measurement of the TDX guest. It > calls the KVM MMU page fault(EPT-violation) handler to trigger the > callbacks for it. > > Reported-by: gkirkpatrick@google.com > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > > --- > v18: > - rename tdx_sept_page_add() -> tdx_mem_page_add(). > - open code tdx_measure_page() into tdx_mem_page_add(). > - remove the change of tools/arch/x86/include/uapi/asm/kvm.h. > > v15 -> v16: > - add check if nr_pages isn't large with > (nr_page << PAGE_SHIFT) >> PAGE_SHIFT > > v14 -> v15: > - add a check if TD is finalized or not to tdx_init_mem_region() > - return -EAGAIN when partial population > --- > arch/x86/include/uapi/asm/kvm.h | 9 ++ > arch/x86/kvm/mmu/mmu.c | 1 + > arch/x86/kvm/vmx/tdx.c | 160 +++++++++++++++++++++++++++++++- > arch/x86/kvm/vmx/tdx.h | 2 + > 4 files changed, 169 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h > index 4000a2e087a8..9fda7c90b7b5 100644 > --- a/arch/x86/include/uapi/asm/kvm.h > +++ b/arch/x86/include/uapi/asm/kvm.h > @@ -572,6 +572,7 @@ enum kvm_tdx_cmd_id { > KVM_TDX_CAPABILITIES = 0, > KVM_TDX_INIT_VM, > KVM_TDX_INIT_VCPU, > + KVM_TDX_INIT_MEM_REGION, > > KVM_TDX_CMD_NR_MAX, > }; > @@ -649,4 +650,12 @@ struct kvm_tdx_init_vm { > struct kvm_cpuid2 cpuid; > }; > > +#define KVM_TDX_MEASURE_MEMORY_REGION (1UL << 0) > + > +struct kvm_tdx_init_mem_region { > + __u64 source_addr; > + __u64 gpa; > + __u64 nr_pages; > +}; > + > #endif /* _ASM_X86_KVM_H */ > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 26d215e85b76..fc258f112e73 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -5663,6 +5663,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) > out: > return r; > } > +EXPORT_SYMBOL(kvm_mmu_load); > > void kvm_mmu_unload(struct kvm_vcpu *vcpu) > { > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > index 4cbcedff4f16..1a5a91b99de9 100644 > --- a/arch/x86/kvm/vmx/tdx.c > +++ b/arch/x86/kvm/vmx/tdx.c > @@ -591,6 +591,69 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, > return 0; > } > > +static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, > + enum pg_level level, kvm_pfn_t pfn) > +{ > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > + hpa_t hpa = pfn_to_hpa(pfn); > + gpa_t gpa = gfn_to_gpa(gfn); > + struct tdx_module_args out; > + hpa_t source_pa; > + bool measure; > + u64 err; > + int i; > + > + /* > + * KVM_INIT_MEM_REGION, tdx_init_mem_region(), supports only 4K page > + * because tdh_mem_page_add() supports only 4K page. > + */ > + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) > + return -EINVAL; > + > + /* > + * In case of TDP MMU, fault handler can run concurrently. 
Note > + * 'source_pa' is a TD scope variable, meaning if there are multiple > + * threads reaching here with all needing to access 'source_pa', it > + * will break. However fortunately this won't happen, because below > + * TDH_MEM_PAGE_ADD code path is only used when VM is being created > + * before it is running, using KVM_TDX_INIT_MEM_REGION ioctl (which > + * always uses vcpu 0's page table and protected by vcpu->mutex). > + */ > + if (KVM_BUG_ON(kvm_tdx->source_pa == INVALID_PAGE, kvm)) { > + tdx_unpin(kvm, pfn); > + return -EINVAL; > + } > + > + source_pa = kvm_tdx->source_pa & ~KVM_TDX_MEASURE_MEMORY_REGION; > + measure = kvm_tdx->source_pa & KVM_TDX_MEASURE_MEMORY_REGION; > + kvm_tdx->source_pa = INVALID_PAGE; > + > + do { > + err = tdh_mem_page_add(kvm_tdx->tdr_pa, gpa, hpa, source_pa, > + &out); > + /* > + * This path is executed during populating initial guest memory > + * image. i.e. before running any vcpu. Race is rare. > + */ > + } while (unlikely(err == TDX_ERROR_SEPT_BUSY)); For page add, since pages are added one by one, there should be no such error, right? > + if (KVM_BUG_ON(err, kvm)) { > + pr_tdx_error(TDH_MEM_PAGE_ADD, err, &out); > + tdx_unpin(kvm, pfn); > + return -EIO; > + } else if (measure) { > + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { > + err = tdh_mr_extend(kvm_tdx->tdr_pa, gpa + i, &out); > + if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { > + pr_tdx_error(TDH_MR_EXTEND, err, &out); > + break; > + } > + } > + } > + > + return 0; > + > +} > + > static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, > enum pg_level level, kvm_pfn_t pfn) > { > @@ -613,9 +676,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, > if (likely(is_td_finalized(kvm_tdx))) > return tdx_mem_page_aug(kvm, gfn, level, pfn); > > - /* TODO: tdh_mem_page_add() comes here for the initial memory. */ > - > - return 0; > + return tdx_mem_page_add(kvm, gfn, level, pfn); > } > > static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, > @@ -1322,6 +1383,96 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) > tdx_track(vcpu->kvm); > } > > +#define TDX_SEPT_PFERR (PFERR_WRITE_MASK | PFERR_GUEST_ENC_MASK) > + > +static int tdx_init_mem_region(struct kvm *kvm, struct kvm_tdx_cmd *cmd) > +{ > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > + struct kvm_tdx_init_mem_region region; > + struct kvm_vcpu *vcpu; > + struct page *page; > + int idx, ret = 0; > + bool added = false; > + > + /* Once TD is finalized, the initial guest memory is fixed. */ > + if (is_td_finalized(kvm_tdx)) > + return -EINVAL; > + > + /* The BSP vCPU must be created before initializing memory regions. 
*/ > + if (!atomic_read(&kvm->online_vcpus)) > + return -EINVAL; > + > + if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) > + return -EINVAL; > + > + if (copy_from_user(®ion, (void __user *)cmd->data, sizeof(region))) > + return -EFAULT; > + > + /* Sanity check */ > + if (!IS_ALIGNED(region.source_addr, PAGE_SIZE) || > + !IS_ALIGNED(region.gpa, PAGE_SIZE) || > + !region.nr_pages || > + region.nr_pages & GENMASK_ULL(63, 63 - PAGE_SHIFT) || > + region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || > + !kvm_is_private_gpa(kvm, region.gpa) || > + !kvm_is_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT))) > + return -EINVAL; > + > + vcpu = kvm_get_vcpu(kvm, 0); > + if (mutex_lock_killable(&vcpu->mutex)) > + return -EINTR; > + > + vcpu_load(vcpu); > + idx = srcu_read_lock(&kvm->srcu); > + > + kvm_mmu_reload(vcpu); > + > + while (region.nr_pages) { > + if (signal_pending(current)) { > + ret = -ERESTARTSYS; > + break; > + } > + > + if (need_resched()) > + cond_resched(); > + > + /* Pin the source page. */ > + ret = get_user_pages_fast(region.source_addr, 1, 0, &page); > + if (ret < 0) > + break; > + if (ret != 1) { > + ret = -ENOMEM; > + break; > + } > + > + kvm_tdx->source_pa = pfn_to_hpa(page_to_pfn(page)) | > + (cmd->flags & KVM_TDX_MEASURE_MEMORY_REGION); > + > + ret = kvm_mmu_map_tdp_page(vcpu, region.gpa, TDX_SEPT_PFERR, > + PG_LEVEL_4K); > + put_page(page); > + if (ret) > + break; > + > + region.source_addr += PAGE_SIZE; > + region.gpa += PAGE_SIZE; > + region.nr_pages--; > + added = true; > + } > + > + srcu_read_unlock(&kvm->srcu, idx); > + vcpu_put(vcpu); > + > + mutex_unlock(&vcpu->mutex); > + > + if (added && region.nr_pages > 0) > + ret = -EAGAIN; > + if (copy_to_user((void __user *)cmd->data, ®ion, sizeof(region))) > + ret = -EFAULT; > + > + return ret; > +} > + > int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) > { > struct kvm_tdx_cmd tdx_cmd; > @@ -1341,6 +1492,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) > case KVM_TDX_INIT_VM: > r = tdx_td_init(kvm, &tdx_cmd); > break; > + case KVM_TDX_INIT_MEM_REGION: > + r = tdx_init_mem_region(kvm, &tdx_cmd); > + break; > default: > r = -EINVAL; > goto out; > diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h > index 783ce329d7da..d589a2caedfb 100644 > --- a/arch/x86/kvm/vmx/tdx.h > +++ b/arch/x86/kvm/vmx/tdx.h > @@ -17,6 +17,8 @@ struct kvm_tdx { > u64 xfam; > int hkid; > > + hpa_t source_pa; > + > bool finalized; > atomic_t tdh_mem_track; >
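For completeness, a sketch of how a VMM might drive this sub-command from userspace. It assumes the struct kvm_tdx_cmd layout (id/flags/data) defined earlier in the series, a vm_fd whose TD has gone through KVM_TDX_INIT_VM and vCPU creation but not yet finalization, and placeholder variables src_buf, guest_pa and size. On partial progress the ioctl updates 'region' and fails with EAGAIN, so the loop simply retries the remainder:

	struct kvm_tdx_init_mem_region region = {
		.source_addr = (__u64)(unsigned long)src_buf,	/* page-aligned initial image */
		.gpa         = guest_pa,			/* page-aligned private GPA */
		.nr_pages    = size / 4096,
	};
	struct kvm_tdx_cmd cmd = {
		.id    = KVM_TDX_INIT_MEM_REGION,
		.flags = KVM_TDX_MEASURE_MEMORY_REGION,	/* also extend the TD measurement */
		.data  = (__u64)(unsigned long)&region,
	};
	int r;

	do {
		r = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
	} while (r < 0 && errno == EAGAIN);	/* 'region' now describes the remainder */

	if (r < 0)
		err(1, "KVM_TDX_INIT_MEM_REGION");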
On Thu, Feb 01, 2024 at 03:06:46PM -0800, David Matlack <dmatlack@google.com> wrote: > +Vipin Sharma > > On Wed, Jan 31, 2024 at 4:21 PM Sean Christopherson <seanjc@google.com> wrote: > > On Mon, Jan 22, 2024, isaku.yamahata@intel.com wrote: > > > > The real reason for this drive-by pseudo-review is that I am hoping/wishing we > > can turn this into a generic KVM ioctl() to allow userspace to pre-map guest > > memory[*]. > > > > If we're going to carry non-trivial code, we might as well squeeze as much use > > out of it as we can. > > > > Beyond wanting to shove this into KVM_MEMORY_ENCRYPT_OP, is there any reason why > > this is a VM ioctl() and not a vCPU ioctl()? Very roughly, couldn't we use a > > struct like this as input to a vCPU ioctl() that maps memory, and optionally > > initializes memory from @source? > > > > struct kvm_memory_mapping { > > __u64 base_gfn; > > __u64 nr_pages; > > __u64 flags; > > __u64 source; > > } > > > > TDX would need to do special things for copying the source, but beyond that most > > of the code in this function is generic. > > > > [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it > > We would also be interested in such an API to reduce the guest > performance impact of intra-host migration. I introduce KVM_MEMORY_MAPPING and KVM_CAP_MEMORY_MAPPING with v19. We can continue the discussion there.
On Mon, Feb 26, 2024, Isaku Yamahata wrote: > On Thu, Feb 01, 2024 at 03:06:46PM -0800, > David Matlack <dmatlack@google.com> wrote: > > > +Vipin Sharma > > > > On Wed, Jan 31, 2024 at 4:21 PM Sean Christopherson <seanjc@google.com> wrote: > > > On Mon, Jan 22, 2024, isaku.yamahata@intel.com wrote: > > > > > > The real reason for this drive-by pseudo-review is that I am hoping/wishing we > > > can turn this into a generic KVM ioctl() to allow userspace to pre-map guest > > > memory[*]. > > > > > > If we're going to carry non-trivial code, we might as well squeeze as much use > > > out of it as we can. > > > > > > Beyond wanting to shove this into KVM_MEMORY_ENCRYPT_OP, is there any reason why > > > this is a VM ioctl() and not a vCPU ioctl()? Very roughly, couldn't we use a > > > struct like this as input to a vCPU ioctl() that maps memory, and optionally > > > initializes memory from @source? > > > > > > struct kvm_memory_mapping { > > > __u64 base_gfn; > > > __u64 nr_pages; > > > __u64 flags; > > > __u64 source; > > > } > > > > > > TDX would need to do special things for copying the source, but beyond that most > > > of the code in this function is generic. > > > > > > [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it > > > > We would also be interested in such an API to reduce the guest > > performance impact of intra-host migration. > > I introduce KVM_MEMORY_MAPPING and KVM_CAP_MEMORY_MAPPING with v19. KVM_MEMORY_MAPPING is not a good ioctl() name. There needs to be a verb in there somewhere, e.g. KVM_MAP_MEMORY, KVM_FAULTIN_MEMORY, etc. > We can continue the discussion there. No, we absolutely cannot continue the conversation there. That is not how kernel development works. Enough is enough. I am archiving v19 and not touching it. Please post an RFC for _just_ this functionality, and follow-up in existing, pre-v19 conversations for anything else that changed between v18 and v19 and might need additional input/discussion.
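Spelled out with a verb in the name, the VMM-side usage might look like the snippet below. Everything here is a placeholder at this point in the discussion: KVM_MAP_MEMORY, the flag values, and whether 'source' is honored for non-TDX VMs are all open questions for the RFC:

	/* Pre-fault (and optionally populate) a 4KiB-page range via a vCPU ioctl. */
	struct kvm_memory_mapping map = {
		.base_gfn = gpa >> 12,
		.nr_pages = bytes >> 12,
		.flags    = 0,					/* e.g. a TDX "measure" flag */
		.source   = (__u64)(unsigned long)initial_image,/* 0 == just map, no copy */
	};

	if (ioctl(vcpu_fd, KVM_MAP_MEMORY, &map))
		err(1, "KVM_MAP_MEMORY");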
On Mon, Feb 26, 2024 at 10:50:55AM -0800, Sean Christopherson <seanjc@google.com> wrote:
> Please post an RFC for _just_ this functionality, and follow-up in existing,
> pre-v19 conversations for anything else that changed between v18 and v19 and might
> need additional input/discussion.

Sure, will post it. My plan is as follows for input/discussion:
- Review SEV-SNP patches by Paolo for commonality
- RFC patch to KVM_MAP_MEMORY or KVM_FAULTIN_MEMORY
- RFC patch for uKVM for confidential VM
On Tue, Feb 27, 2024, Isaku Yamahata wrote:
> On Mon, Feb 26, 2024 at 10:50:55AM -0800,
> Sean Christopherson <seanjc@google.com> wrote:
>
> > Please post an RFC for _just_ this functionality, and follow-up in existing,
> > pre-v19 conversations for anything else that changed between v18 and v19 and might
> > need additional input/discussion.
>
> Sure, will post it. My plan is as follows for input/discussion:
> - Review SEV-SNP patches by Paolo for commonality
> - RFC patch to KVM_MAP_MEMORY or KVM_FAULTIN_MEMORY
> - RFC patch for uKVM for confidential VM

uKVM?
On Tue, Feb 27, 2024 at 09:30:11AM -0800, Sean Christopherson <seanjc@google.com> wrote:
> On Tue, Feb 27, 2024, Isaku Yamahata wrote:
> > On Mon, Feb 26, 2024 at 10:50:55AM -0800,
> > Sean Christopherson <seanjc@google.com> wrote:
> >
> > > Please post an RFC for _just_ this functionality, and follow-up in existing,
> > > pre-v19 conversations for anything else that changed between v18 and v19 and might
> > > need additional input/discussion.
> >
> > Sure, will post it. My plan is as follows for input/discussion:
> > - Review SEV-SNP patches by Paolo for commonality
> > - RFC patch to KVM_MAP_MEMORY or KVM_FAULTIN_MEMORY
> > - RFC patch for uKVM for confidential VM
>
> uKVM?

I meant uAPI, sorry for the typo. Although I looked into a unified uAPI with SEV, the gain seems to be small or nonexistent. I'm currently planning to drop it based on the feedback at https://lore.kernel.org/kvm/ZL%2Fr6Vca8WkFVaic@google.com/
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 4000a2e087a8..9fda7c90b7b5 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -572,6 +572,7 @@ enum kvm_tdx_cmd_id { KVM_TDX_CAPABILITIES = 0, KVM_TDX_INIT_VM, KVM_TDX_INIT_VCPU, + KVM_TDX_INIT_MEM_REGION, KVM_TDX_CMD_NR_MAX, }; @@ -649,4 +650,12 @@ struct kvm_tdx_init_vm { struct kvm_cpuid2 cpuid; }; +#define KVM_TDX_MEASURE_MEMORY_REGION (1UL << 0) + +struct kvm_tdx_init_mem_region { + __u64 source_addr; + __u64 gpa; + __u64 nr_pages; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 26d215e85b76..fc258f112e73 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5663,6 +5663,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } +EXPORT_SYMBOL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 4cbcedff4f16..1a5a91b99de9 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -591,6 +591,69 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, return 0; } +static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + hpa_t hpa = pfn_to_hpa(pfn); + gpa_t gpa = gfn_to_gpa(gfn); + struct tdx_module_args out; + hpa_t source_pa; + bool measure; + u64 err; + int i; + + /* + * KVM_INIT_MEM_REGION, tdx_init_mem_region(), supports only 4K page + * because tdh_mem_page_add() supports only 4K page. + */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + /* + * In case of TDP MMU, fault handler can run concurrently. Note + * 'source_pa' is a TD scope variable, meaning if there are multiple + * threads reaching here with all needing to access 'source_pa', it + * will break. However fortunately this won't happen, because below + * TDH_MEM_PAGE_ADD code path is only used when VM is being created + * before it is running, using KVM_TDX_INIT_MEM_REGION ioctl (which + * always uses vcpu 0's page table and protected by vcpu->mutex). + */ + if (KVM_BUG_ON(kvm_tdx->source_pa == INVALID_PAGE, kvm)) { + tdx_unpin(kvm, pfn); + return -EINVAL; + } + + source_pa = kvm_tdx->source_pa & ~KVM_TDX_MEASURE_MEMORY_REGION; + measure = kvm_tdx->source_pa & KVM_TDX_MEASURE_MEMORY_REGION; + kvm_tdx->source_pa = INVALID_PAGE; + + do { + err = tdh_mem_page_add(kvm_tdx->tdr_pa, gpa, hpa, source_pa, + &out); + /* + * This path is executed during populating initial guest memory + * image. i.e. before running any vcpu. Race is rare. + */ + } while (unlikely(err == TDX_ERROR_SEPT_BUSY)); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MEM_PAGE_ADD, err, &out); + tdx_unpin(kvm, pfn); + return -EIO; + } else if (measure) { + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { + err = tdh_mr_extend(kvm_tdx->tdr_pa, gpa + i, &out); + if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { + pr_tdx_error(TDH_MR_EXTEND, err, &out); + break; + } + } + } + + return 0; + +} + static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, enum pg_level level, kvm_pfn_t pfn) { @@ -613,9 +676,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, if (likely(is_td_finalized(kvm_tdx))) return tdx_mem_page_aug(kvm, gfn, level, pfn); - /* TODO: tdh_mem_page_add() comes here for the initial memory. 
*/ - - return 0; + return tdx_mem_page_add(kvm, gfn, level, pfn); } static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, @@ -1322,6 +1383,96 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) tdx_track(vcpu->kvm); } +#define TDX_SEPT_PFERR (PFERR_WRITE_MASK | PFERR_GUEST_ENC_MASK) + +static int tdx_init_mem_region(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_mem_region region; + struct kvm_vcpu *vcpu; + struct page *page; + int idx, ret = 0; + bool added = false; + + /* Once TD is finalized, the initial guest memory is fixed. */ + if (is_td_finalized(kvm_tdx)) + return -EINVAL; + + /* The BSP vCPU must be created before initializing memory regions. */ + if (!atomic_read(&kvm->online_vcpus)) + return -EINVAL; + + if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) + return -EINVAL; + + if (copy_from_user(®ion, (void __user *)cmd->data, sizeof(region))) + return -EFAULT; + + /* Sanity check */ + if (!IS_ALIGNED(region.source_addr, PAGE_SIZE) || + !IS_ALIGNED(region.gpa, PAGE_SIZE) || + !region.nr_pages || + region.nr_pages & GENMASK_ULL(63, 63 - PAGE_SHIFT) || + region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || + !kvm_is_private_gpa(kvm, region.gpa) || + !kvm_is_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT))) + return -EINVAL; + + vcpu = kvm_get_vcpu(kvm, 0); + if (mutex_lock_killable(&vcpu->mutex)) + return -EINTR; + + vcpu_load(vcpu); + idx = srcu_read_lock(&kvm->srcu); + + kvm_mmu_reload(vcpu); + + while (region.nr_pages) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + if (need_resched()) + cond_resched(); + + /* Pin the source page. */ + ret = get_user_pages_fast(region.source_addr, 1, 0, &page); + if (ret < 0) + break; + if (ret != 1) { + ret = -ENOMEM; + break; + } + + kvm_tdx->source_pa = pfn_to_hpa(page_to_pfn(page)) | + (cmd->flags & KVM_TDX_MEASURE_MEMORY_REGION); + + ret = kvm_mmu_map_tdp_page(vcpu, region.gpa, TDX_SEPT_PFERR, + PG_LEVEL_4K); + put_page(page); + if (ret) + break; + + region.source_addr += PAGE_SIZE; + region.gpa += PAGE_SIZE; + region.nr_pages--; + added = true; + } + + srcu_read_unlock(&kvm->srcu, idx); + vcpu_put(vcpu); + + mutex_unlock(&vcpu->mutex); + + if (added && region.nr_pages > 0) + ret = -EAGAIN; + if (copy_to_user((void __user *)cmd->data, ®ion, sizeof(region))) + ret = -EFAULT; + + return ret; +} + int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_tdx_cmd tdx_cmd; @@ -1341,6 +1492,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) case KVM_TDX_INIT_VM: r = tdx_td_init(kvm, &tdx_cmd); break; + case KVM_TDX_INIT_MEM_REGION: + r = tdx_init_mem_region(kvm, &tdx_cmd); + break; default: r = -EINVAL; goto out; diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 783ce329d7da..d589a2caedfb 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -17,6 +17,8 @@ struct kvm_tdx { u64 xfam; int hkid; + hpa_t source_pa; + bool finalized; atomic_t tdh_mem_track;
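One note on the sanity check in tdx_init_mem_region() above: the GENMASK_ULL(63, 63 - PAGE_SHIFT) test is what keeps the later nr_pages << PAGE_SHIFT arithmetic from wrapping. Worked through for 4K pages (PAGE_SHIFT == 12), purely as an illustration:

	/*
	 * GENMASK_ULL(63, 51) covers bits 51..63 of nr_pages, so passing the
	 * check means:
	 *
	 *   nr_pages       < 1ULL << 51   (at most ~2^51 pages, i.e. under 8 EiB)
	 *   nr_pages << 12 < 1ULL << 63   (cannot overflow a u64, top bit clear)
	 *
	 * Any remaining wrap of region.gpa + (nr_pages << 12) is then caught by
	 * the explicit "<= region.gpa" comparison.
	 */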