Message ID | 20240904030751.117579-15-rick.p.edgecombe@intel.com (mailing list archive)
---|---
State | New, archived
Series | TDX MMU Part 2
> --- a/arch/x86/kvm/vmx/main.c > +++ b/arch/x86/kvm/vmx/main.c > @@ -36,9 +36,21 @@ static __init int vt_hardware_setup(void) > * is KVM may allocate couple of more bytes than needed for > * each VM. > */ > - if (enable_tdx) > + if (enable_tdx) { > vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, > sizeof(struct kvm_tdx)); > + /* > + * Note, TDX may fail to initialize in a later time in > + * vt_init(), in which case it is not necessary to setup > + * those callbacks. But making them valid here even > + * when TDX fails to init later is fine because those > + * callbacks won't be called if the VM isn't TDX guest. > + */ > + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; > + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; > + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; > + vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; Nit: The callbacks in 'struct kvm_x86_ops' have name "external", but TDX callbacks have name "private". Should we rename TDX callbacks to make them aligned? > + } > > return 0; > } > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > index 6feb3ab96926..b8cd5a629a80 100644 > --- a/arch/x86/kvm/vmx/tdx.c > +++ b/arch/x86/kvm/vmx/tdx.c > @@ -447,6 +447,177 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) > td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); > } > > +static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn) > +{ > + struct page *page = pfn_to_page(pfn); > + > + put_page(page); > +} > + > +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, > + enum pg_level level, kvm_pfn_t pfn) > +{ > + int tdx_level = pg_level_to_tdx_sept_level(level); > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > + hpa_t hpa = pfn_to_hpa(pfn); > + gpa_t gpa = gfn_to_gpa(gfn); > + u64 entry, level_state; > + u64 err; > + > + err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state); > + if (unlikely(err == TDX_ERROR_SEPT_BUSY)) { > + tdx_unpin(kvm, pfn); > + return -EAGAIN; > + } Nit: Here (and other non-fatal error cases) I think we should return -EBUSY to make it consistent with non-TDX case? E.g., the non-TDX case has: if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; And the comment of tdp_mmu_set_spte_atomic() currently says it can only return 0 or -EBUSY. It needs to be patched to reflect it can also return other non-0 errors like -EIO but those are fatal. In terms of non-fatal error I don't think we need another -EAGAIN. /* * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically [...] * Return: * * 0 - If the SPTE was set. * * -EBUSY - If the SPTE cannot be set. In this case this function will * have no side-effects other than setting iter->old_spte to * the last known value of the spte. */ [...] > + > +static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, > + enum pg_level level, kvm_pfn_t pfn) > +{ > [...] > + > + hpa_with_hkid = set_hkid_to_hpa(hpa, (u16)kvm_tdx->hkid); > + do { > + /* > + * TDX_OPERAND_BUSY can happen on locking PAMT entry. Because > + * this page was removed above, other thread shouldn't be > + * repeatedly operating on this page. Just retry loop. > + */ > + err = tdh_phymem_page_wbinvd(hpa_with_hkid); > + } while (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX))); In what case(s) other threads can concurrently lock the PAMT entry, leading to the above BUSY? [...] 
> +
> +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
> +				  enum pg_level level, kvm_pfn_t pfn)
> +{
> +	int ret;
> +
> +	/*
> +	 * HKID is released when vm_free() which is after closing gmem_fd

From the latest dev branch, HKID is freed from vt_vm_destroy(), not
vm_free() (which should be tdx_vm_free(), btw):

static void vt_vm_destroy(struct kvm *kvm)
{
	if (is_td(kvm))
		return tdx_mmu_release_hkid(kvm);

	vmx_vm_destroy(kvm);
}

Btw, why not have a tdx_vm_destroy() wrapper?  It seems all other vt_xx()s
have a tdx_xx(), but only this one calls tdx_mmu_release_hkid() directly.

> +	 * which causes gmem invalidation to zap all spte.
> +	 * Population is only allowed after KVM_TDX_INIT_VM.
> +	 */

What does the second sentence ("Population ...") mean? Why is it relevant
here?
On Fri, 2024-09-06 at 14:10 +1200, Huang, Kai wrote: > > > --- a/arch/x86/kvm/vmx/main.c > > +++ b/arch/x86/kvm/vmx/main.c > > @@ -36,9 +36,21 @@ static __init int vt_hardware_setup(void) > > * is KVM may allocate couple of more bytes than needed for > > * each VM. > > */ > > - if (enable_tdx) > > + if (enable_tdx) { > > vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, > > sizeof(struct kvm_tdx)); > > + /* > > + * Note, TDX may fail to initialize in a later time in > > + * vt_init(), in which case it is not necessary to setup > > + * those callbacks. But making them valid here even > > + * when TDX fails to init later is fine because those > > + * callbacks won't be called if the VM isn't TDX guest. > > + */ > > + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; > > + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; > > + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; > > + vt_x86_ops.remove_external_spte = > > tdx_sept_remove_private_spte; > > Nit: The callbacks in 'struct kvm_x86_ops' have name "external", but > TDX callbacks have name "private". Should we rename TDX callbacks to > make them aligned? "external" is the core MMU naming abstraction. I think you were part of the discussion to purge special TDX private naming from the core MMU to avoid confusion with AMD private memory in the last MMU series. So external page tables ended up being a general concept, and private mem is the TDX use. In practice of course it will likely only be used for TDX. So I thought the external<->private connection here was nice to have. > > > + } > > > > return 0; > > } > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > > index 6feb3ab96926..b8cd5a629a80 100644 > > --- a/arch/x86/kvm/vmx/tdx.c > > +++ b/arch/x86/kvm/vmx/tdx.c > > @@ -447,6 +447,177 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t > > root_hpa, int pgd_level) > > td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); > > } > > > > +static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn) > > +{ > > + struct page *page = pfn_to_page(pfn); > > + > > + put_page(page); > > +} > > + > > +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, > > + enum pg_level level, kvm_pfn_t pfn) > > +{ > > + int tdx_level = pg_level_to_tdx_sept_level(level); > > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > > + hpa_t hpa = pfn_to_hpa(pfn); > > + gpa_t gpa = gfn_to_gpa(gfn); > > + u64 entry, level_state; > > + u64 err; > > + > > + err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state); > > + if (unlikely(err == TDX_ERROR_SEPT_BUSY)) { > > + tdx_unpin(kvm, pfn); > > + return -EAGAIN; > > + } > > Nit: Here (and other non-fatal error cases) I think we should return > -EBUSY to make it consistent with non-TDX case? E.g., the non-TDX case has: > > if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) > return -EBUSY; > > And the comment of tdp_mmu_set_spte_atomic() currently says it can only > return 0 or -EBUSY. It needs to be patched to reflect it can also > return other non-0 errors like -EIO but those are fatal. In terms of > non-fatal error I don't think we need another -EAGAIN. Yes, good point. > > /* > * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically > > [...] > > * Return: > * * 0 - If the SPTE was set. > * * -EBUSY - If the SPTE cannot be set. In this case this function will > * have no side-effects other than setting iter->old_spte to > * the last known value of the spte. > */ > > [...] 
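For reference, a minimal sketch of what the agreed-on change could look
like in tdx_mem_page_aug(); only the switch from -EAGAIN to -EBUSY comes
from the discussion above, the comment wording here is an assumption:

	err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state);
	if (unlikely(err == TDX_ERROR_SEPT_BUSY)) {
		tdx_unpin(kvm, pfn);
		/*
		 * Match the TDP MMU convention: callers of
		 * tdp_mmu_set_spte_atomic() treat -EBUSY as "retry the
		 * fault", the same as a lost cmpxchg on the mirror SPTE.
		 */
		return -EBUSY;
	}

The kerneldoc of tdp_mmu_set_spte_atomic() quoted above would then need a
note that fatal errors such as -EIO can also be propagated, as Kai points
out.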
> 
> > +
> > +static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
> > +				       enum pg_level level, kvm_pfn_t pfn)
> > +{
> 
> [...]
> 
> > +
> > +	hpa_with_hkid = set_hkid_to_hpa(hpa, (u16)kvm_tdx->hkid);
> > +	do {
> > +		/*
> > +		 * TDX_OPERAND_BUSY can happen on locking PAMT entry. Because
> > +		 * this page was removed above, other thread shouldn't be
> > +		 * repeatedly operating on this page. Just retry loop.
> > +		 */
> > +		err = tdh_phymem_page_wbinvd(hpa_with_hkid);
> > +	} while (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX)));
> 
> In what case(s) other threads can concurrently lock the PAMT entry,
> leading to the above BUSY?

Good question, let's add this to the SEAMCALL retry research.

> 
> [...]
> 
> > +
> > +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
> > +				  enum pg_level level, kvm_pfn_t pfn)
> > +{
> > +	int ret;
> > +
> > +	/*
> > +	 * HKID is released when vm_free() which is after closing gmem_fd
> 
> From latest dev branch HKID is freed from vt_vm_destroy(), but not
> vm_free() (which should be tdx_vm_free() btw).

Oh, yes, we should update the comment.

> 
> static void vt_vm_destroy(struct kvm *kvm)
> {
> 	if (is_td(kvm))
> 		return tdx_mmu_release_hkid(kvm);
> 
> 	vmx_vm_destroy(kvm);
> }
> 
> Btw, why not have a tdx_vm_destroy() wrapper? Seems all other vt_xx()s
> have a tdx_xx() but only this one calls tdx_mmu_release_hkid() directly.

No strong reason. It's asymmetric to the other tdx callbacks, but KVM code
tends to be less wrapped, and a tdx_vm_destroy() would be a one-line
function. So I think it fits in other ways.

> 
> > +	 * which causes gmem invalidation to zap all spte.
> > +	 * Population is only allowed after KVM_TDX_INIT_VM.
> > +	 */
> 
> What does the second sentence ("Population ...") mean? Why is it
> relevant here?
> 

How about:

	/*
	 * HKID is released after all private pages have been removed,
	 * and set before any might be populated. Warn if zapping is
	 * attempted when there can't be anything populated in the private
	 * EPT.
	 */

But actually, I wonder if we need to remove the KVM_BUG_ON(). I think if
you did a KVM_PRE_FAULT_MEMORY and then deleted the memslot you could hit
it?
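If the wrapper Kai asked about were added anyway, it would presumably be a
one-liner along these lines (hypothetical, since the conclusion above is
that it isn't really needed):

static void tdx_vm_destroy(struct kvm *kvm)
{
	/* The HKID must be released before the TD's memory is freed. */
	tdx_mmu_release_hkid(kvm);
}

static void vt_vm_destroy(struct kvm *kvm)
{
	if (is_td(kvm))
		return tdx_vm_destroy(kvm);

	vmx_vm_destroy(kvm);
}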
On Tue, Sep 10, 2024 at 05:03:57AM +0800, Edgecombe, Rick P wrote: > On Fri, 2024-09-06 at 14:10 +1200, Huang, Kai wrote: ... > > > + * which causes gmem invalidation to zap all spte. > > > + * Population is only allowed after KVM_TDX_INIT_VM. > > > + */ > > > > What does the second sentence ("Population ...") meaning? Why is it > > relevant here? > > > How about: > /* > * HKID is released after all private pages have been removed, > * and set before any might be populated. Warn if zapping is > * attempted when there can't be anything populated in the private > * EPT. > */ > > But actually, I wonder if we need to remove the KVM_BUG_ON(). I think if you did > a KVM_PRE_FAULT_MEMORY and then deleted the memslot you could hit it? If we disallow vCPU creation before TD initialization, as discussed in [1], the BUG_ON should not be hit. [1] https://lore.kernel.org/all/ZtAU7FIV2Xkw+L3O@yzhao56-desk.sh.intel.com/
On 9/9/24 23:03, Edgecombe, Rick P wrote:
> KVM code tends to be less wrapped and a tdx_vm_destroy() would be a
> one-line function. So I think it fits in other ways.

Yes, no problem there. Sometimes one-line functions are ok (see the
ept_sync_global() case elsewhere in the series), sometimes they're
overkill, especially if they wrap a function defined in the same file as
the wrapper.

>>> +	 * which causes gmem invalidation to zap all spte.
>>> +	 * Population is only allowed after KVM_TDX_INIT_VM.
>>> +	 */
>> What does the second sentence ("Population ...") mean? Why is it
>> relevant here?
>>
> How about:
> 	/*
> 	 * HKID is released after all private pages have been removed,
> 	 * and set before any might be populated. Warn if zapping is
> 	 * attempted when there can't be anything populated in the private
> 	 * EPT.
> 	 */
> 
> But actually, I wonder if we need to remove the KVM_BUG_ON(). I think if
> you did a KVM_PRE_FAULT_MEMORY and then deleted the memslot you could hit
> it?

I think all paths to handle_removed_pt() are safe:

__tdp_mmu_zap_root
  tdp_mmu_zap_root
    kvm_tdp_mmu_zap_all
      kvm_arch_flush_shadow_all
        kvm_flush_shadow_all
          kvm_destroy_vm (*)
          kvm_mmu_notifier_release (*)
    kvm_tdp_mmu_zap_invalidated_roots
      kvm_mmu_zap_all_fast (**)

kvm_tdp_mmu_zap_sp
  kvm_recover_nx_huge_pages (***)

(*) only called at destroy time
(**) only invalidates direct roots
(***) shouldn't apply to TDX I hope?

Paolo
On Tue, 2024-09-10 at 11:33 +0200, Paolo Bonzini wrote:
> > But actually, I wonder if we need to remove the KVM_BUG_ON(). I think if
> > you did a KVM_PRE_FAULT_MEMORY and then deleted the memslot you could hit
> > it?
> 
> I think all paths to handle_removed_pt() are safe:
> 
> __tdp_mmu_zap_root
>   tdp_mmu_zap_root
>     kvm_tdp_mmu_zap_all
>       kvm_arch_flush_shadow_all
>         kvm_flush_shadow_all
>           kvm_destroy_vm (*)
>           kvm_mmu_notifier_release (*)
>     kvm_tdp_mmu_zap_invalidated_roots
>       kvm_mmu_zap_all_fast (**)
> 
> kvm_tdp_mmu_zap_sp
>   kvm_recover_nx_huge_pages (***)

But not all paths to remove_external_spte():

kvm_arch_flush_shadow_memslot()
  kvm_mmu_zap_memslot_leafs()
    kvm_tdp_mmu_unmap_gfn_range()
      tdp_mmu_zap_leafs()
        tdp_mmu_iter_set_spte()
          tdp_mmu_set_spte()
            remove_external_spte()
              tdx_sept_remove_private_spte()

But we can probably keep the warning if we prevent KVM_PRE_FAULT_MEMORY as
you pointed out earlier. I didn't see that kvm->arch.pre_fault_allowed got
added.
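For context, the gate Rick refers to is roughly the following shape in the
common x86 pre-fault path (simplified; the field name comes from the
discussion, its exact placement is an assumption):

	/* KVM_PRE_FAULT_MEMORY is refused until the arch code opts in. */
	if (!vcpu->kvm->arch.pre_fault_allowed)
		return -EOPNOTSUPP;

A TDX implementation would then presumably set pre_fault_allowed only
after KVM_TDX_INIT_VM succeeds, which is what would make the KVM_BUG_ON()
unreachable from the memslot-deletion path above.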
On Wed, Sep 11, 2024 at 07:58:01AM +0800, Edgecombe, Rick P wrote:
> On Tue, 2024-09-10 at 11:33 +0200, Paolo Bonzini wrote:
> > > But actually, I wonder if we need to remove the KVM_BUG_ON(). I think if
> > > you did a KVM_PRE_FAULT_MEMORY and then deleted the memslot you could
> > > hit it?
> > 
> > I think all paths to handle_removed_pt() are safe:
> > 
> > __tdp_mmu_zap_root
> >   tdp_mmu_zap_root
> >     kvm_tdp_mmu_zap_all
> >       kvm_arch_flush_shadow_all
> >         kvm_flush_shadow_all
> >           kvm_destroy_vm (*)
> >           kvm_mmu_notifier_release (*)
> >     kvm_tdp_mmu_zap_invalidated_roots
> >       kvm_mmu_zap_all_fast (**)
> > 
> > kvm_tdp_mmu_zap_sp
> >   kvm_recover_nx_huge_pages (***)
> 
> But not all paths to remove_external_spte():
> 
> kvm_arch_flush_shadow_memslot()
>   kvm_mmu_zap_memslot_leafs()
>     kvm_tdp_mmu_unmap_gfn_range()
>       tdp_mmu_zap_leafs()
>         tdp_mmu_iter_set_spte()
>           tdp_mmu_set_spte()
>             remove_external_spte()
>               tdx_sept_remove_private_spte()
> 
> But we can probably keep the warning if we prevent KVM_PRE_FAULT_MEMORY as
> you pointed out earlier. I didn't see that kvm->arch.pre_fault_allowed got
> added.

Note: If we disallow vCPU creation before the VM ioctl KVM_TDX_INIT_VM is
done, the vCPU ioctl KVM_PRE_FAULT_MEMORY can't be executed. Then we can't
hit the "if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))" in
tdx_sept_remove_private_spte().
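A sketch of the ordering Yan describes, using a hypothetical
is_td_initialized() helper (the real series tracks this state differently,
so treat the details as assumptions):

int tdx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

	/*
	 * No vCPUs, and therefore no vCPU ioctls such as
	 * KVM_PRE_FAULT_MEMORY, until KVM_TDX_INIT_VM has completed.
	 */
	if (!is_td_initialized(kvm_tdx))	/* hypothetical helper */
		return -EIO;

	return 0;
}

With that ordering, nothing can be mapped into the private EPT before the
HKID is assigned, so, as Yan notes, the KVM_BUG_ON(!is_hkid_assigned(...))
in tdx_sept_remove_private_spte() cannot be hit.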
On 9/4/2024 11:07 AM, Rick Edgecombe wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > [...] > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > index 6feb3ab96926..b8cd5a629a80 100644 > --- a/arch/x86/kvm/vmx/tdx.c > +++ b/arch/x86/kvm/vmx/tdx.c > @@ -447,6 +447,177 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) > td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); > } > > +static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn) > +{ > + struct page *page = pfn_to_page(pfn); > + > + put_page(page); Nit: It can be put_page(pfn_to_page(pfn)); > +} > + > +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, > + enum pg_level level, kvm_pfn_t pfn) > +{ > + int tdx_level = pg_level_to_tdx_sept_level(level); > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > + hpa_t hpa = pfn_to_hpa(pfn); > + gpa_t gpa = gfn_to_gpa(gfn); > + u64 entry, level_state; > + u64 err; > + > + err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state); Nit: Usually, kernel prefers to handle and return for error conditions first. But for this case, for all error conditions, it needs to unpin the page. Is it better to return the successful case first, so that it only needs to call tdx_unpin() once? > + if (unlikely(err == TDX_ERROR_SEPT_BUSY)) { > + tdx_unpin(kvm, pfn); > + return -EAGAIN; > + } > + if (unlikely(err == (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))) { > + if (tdx_get_sept_level(level_state) == tdx_level && > + tdx_get_sept_state(level_state) == TDX_SEPT_PENDING && > + is_last_spte(entry, level) && > + spte_to_pfn(entry) == pfn && > + entry & VMX_EPT_SUPPRESS_VE_BIT) { Can this condition be triggered? For contention from multiple vCPUs, the winner has frozen the SPTE, it shouldn't trigger this. Could KVM do page aug for a same page multiple times somehow? > + tdx_unpin(kvm, pfn); > + return -EAGAIN; > + } > + } > + if (KVM_BUG_ON(err, kvm)) { > + pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); > + tdx_unpin(kvm, pfn); > + return -EIO; > + } > + > + return 0; > +} > + > +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, > + enum pg_level level, kvm_pfn_t pfn) > +{ > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > + > + /* TODO: handle large pages. */ > + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) > + return -EINVAL; > + > + /* > + * Because guest_memfd doesn't support page migration with > + * a_ops->migrate_folio (yet), no callback is triggered for KVM on page > + * migration. Until guest_memfd supports page migration, prevent page > + * migration. > + * TODO: Once guest_memfd introduces callback on page migration, > + * implement it and remove get_page/put_page(). > + */ > + get_page(pfn_to_page(pfn)); > + > + if (likely(is_td_finalized(kvm_tdx))) > + return tdx_mem_page_aug(kvm, gfn, level, pfn); > + > + /* > + * TODO: KVM_MAP_MEMORY support to populate before finalize comes > + * here for the initial memory. > + */ > + return 0; Is it better to return error before adding the support? > +} > + [...]
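Purely to illustrate Binbin's two nits (put_page(pfn_to_page(pfn)) and
handling the success case first so the page is unpinned in one place),
one way the code could be reshuffled; this is a sketch, not what was
merged, and the TDX_EPT_ENTRY_STATE_INCORRECT special case is omitted for
brevity:

static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn)
{
	put_page(pfn_to_page(pfn));
}

static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
			    enum pg_level level, kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	hpa_t hpa = pfn_to_hpa(pfn);
	gpa_t gpa = gfn_to_gpa(gfn);
	u64 entry, level_state;
	u64 err;

	err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state);
	if (likely(!err))
		return 0;

	/* Every failure path drops the reference taken by the caller. */
	tdx_unpin(kvm, pfn);

	if (unlikely(err == TDX_ERROR_SEPT_BUSY))
		return -EAGAIN;

	KVM_BUG_ON(err, kvm);
	pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
	return -EIO;
}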
On Wed, Oct 30, 2024 at 11:03:39AM +0800, Binbin Wu wrote: > On 9/4/2024 11:07 AM, Rick Edgecombe wrote: > > From: Isaku Yamahata <isaku.yamahata@intel.com> > > > [...] > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > > index 6feb3ab96926..b8cd5a629a80 100644 > > --- a/arch/x86/kvm/vmx/tdx.c > > +++ b/arch/x86/kvm/vmx/tdx.c > > @@ -447,6 +447,177 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) > > td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); > > } > > +static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn) > > +{ > > + struct page *page = pfn_to_page(pfn); > > + > > + put_page(page); > Nit: It can be > put_page(pfn_to_page(pfn)); > Yes, thanks. > > > +} > > + > > +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, > > + enum pg_level level, kvm_pfn_t pfn) > > +{ > > + int tdx_level = pg_level_to_tdx_sept_level(level); > > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > > + hpa_t hpa = pfn_to_hpa(pfn); > > + gpa_t gpa = gfn_to_gpa(gfn); > > + u64 entry, level_state; > > + u64 err; > > + > > + err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state); > Nit: > Usually, kernel prefers to handle and return for error conditions first. > > But for this case, for all error conditions, it needs to unpin the page. > Is it better to return the successful case first, so that it only needs > to call tdx_unpin() once? > > > + if (unlikely(err == TDX_ERROR_SEPT_BUSY)) { > > + tdx_unpin(kvm, pfn); > > + return -EAGAIN; > > + } As how to handle the busy is not decided up to now, I prefer to leave this hunk as is. > > + if (unlikely(err == (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))) { > > + if (tdx_get_sept_level(level_state) == tdx_level && > > + tdx_get_sept_state(level_state) == TDX_SEPT_PENDING && > > + is_last_spte(entry, level) && > > + spte_to_pfn(entry) == pfn && > > + entry & VMX_EPT_SUPPRESS_VE_BIT) { > Can this condition be triggered? > For contention from multiple vCPUs, the winner has frozen the SPTE, > it shouldn't trigger this. > Could KVM do page aug for a same page multiple times somehow? This condition should not be triggered due to the BUG_ON in set_external_spte_present(). With Isaku's series [1], this condition will not happen either. Will remove it. Thanks! [1] https://lore.kernel.org/all/cover.1728718232.git.isaku.yamahata@intel.com/ > > > + tdx_unpin(kvm, pfn); > > + return -EAGAIN; > > + } > > + } > > + if (KVM_BUG_ON(err, kvm)) { > > + pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); > > + tdx_unpin(kvm, pfn); > > + return -EIO; > > + } > > + > > + return 0; > > +} > > + > > +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, > > + enum pg_level level, kvm_pfn_t pfn) > > +{ > > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > > + > > + /* TODO: handle large pages. */ > > + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) > > + return -EINVAL; > > + > > + /* > > + * Because guest_memfd doesn't support page migration with > > + * a_ops->migrate_folio (yet), no callback is triggered for KVM on page > > + * migration. Until guest_memfd supports page migration, prevent page > > + * migration. > > + * TODO: Once guest_memfd introduces callback on page migration, > > + * implement it and remove get_page/put_page(). > > + */ > > + get_page(pfn_to_page(pfn)); > > + > > + if (likely(is_td_finalized(kvm_tdx))) > > + return tdx_mem_page_aug(kvm, gfn, level, pfn); > > + > > + /* > > + * TODO: KVM_MAP_MEMORY support to populate before finalize comes > > + * here for the initial memory. 
> > +	 */
> > +	return 0;
> Is it better to return error before adding the support?

Hmm, returning an error is better, though returning 0 is not wrong. The
future tdx_mem_page_record_premap_cnt() just increases
kvm_tdx->nr_premapped, while tdx_vcpu_init_mem_region() can be implemented
without checking kvm_tdx->nr_premapped.

> > +}
> > +
> [...]
> 
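For the interim version being discussed (before pre-mapping support
exists), the "not yet finalized" branch in tdx_sept_set_private_spte()
could fail explicitly instead of silently returning 0; the error value and
the unpin here are assumptions in this sketch:

	if (likely(is_td_finalized(kvm_tdx)))
		return tdx_mem_page_aug(kvm, gfn, level, pfn);

	/*
	 * Populating the private EPT before the TD is finalized is not
	 * supported yet; a later patch is expected to record the page
	 * here instead (tdx_mem_page_record_premap_cnt() per the reply
	 * above).
	 */
	tdx_unpin(kvm, pfn);
	return -EIO;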
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 1c86849680a3..bf6fd5cca1d6 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -36,9 +36,21 @@ static __init int vt_hardware_setup(void) * is KVM may allocate couple of more bytes than needed for * each VM. */ - if (enable_tdx) + if (enable_tdx) { vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); + /* + * Note, TDX may fail to initialize in a later time in + * vt_init(), in which case it is not necessary to setup + * those callbacks. But making them valid here even + * when TDX fails to init later is fine because those + * callbacks won't be called if the VM isn't TDX guest. + */ + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; + vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; + } return 0; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 6feb3ab96926..b8cd5a629a80 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -447,6 +447,177 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); } +static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn) +{ + struct page *page = pfn_to_page(pfn); + + put_page(page); +} + +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + hpa_t hpa = pfn_to_hpa(pfn); + gpa_t gpa = gfn_to_gpa(gfn); + u64 entry, level_state; + u64 err; + + err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state); + if (unlikely(err == TDX_ERROR_SEPT_BUSY)) { + tdx_unpin(kvm, pfn); + return -EAGAIN; + } + if (unlikely(err == (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))) { + if (tdx_get_sept_level(level_state) == tdx_level && + tdx_get_sept_state(level_state) == TDX_SEPT_PENDING && + is_last_spte(entry, level) && + spte_to_pfn(entry) == pfn && + entry & VMX_EPT_SUPPRESS_VE_BIT) { + tdx_unpin(kvm, pfn); + return -EAGAIN; + } + } + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); + tdx_unpin(kvm, pfn); + return -EIO; + } + + return 0; +} + +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + /* + * Because guest_memfd doesn't support page migration with + * a_ops->migrate_folio (yet), no callback is triggered for KVM on page + * migration. Until guest_memfd supports page migration, prevent page + * migration. + * TODO: Once guest_memfd introduces callback on page migration, + * implement it and remove get_page/put_page(). + */ + get_page(pfn_to_page(pfn)); + + if (likely(is_td_finalized(kvm_tdx))) + return tdx_mem_page_aug(kvm, gfn, level, pfn); + + /* + * TODO: KVM_MAP_MEMORY support to populate before finalize comes + * here for the initial memory. 
+ */ + return 0; +} + +static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + hpa_t hpa = pfn_to_hpa(pfn); + hpa_t hpa_with_hkid; + u64 err, entry, level_state; + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + do { + /* + * When zapping private page, write lock is held. So no race + * condition with other vcpu sept operation. Race only with + * TDH.VP.ENTER. + */ + err = tdh_mem_page_remove(kvm_tdx, gpa, tdx_level, &entry, + &level_state); + } while (unlikely(err == TDX_ERROR_SEPT_BUSY)); + if (unlikely(!is_td_finalized(kvm_tdx) && + err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) { + /* + * This page was mapped with KVM_MAP_MEMORY, but + * KVM_TDX_INIT_MEM_REGION is not issued yet. + */ + if (!is_last_spte(entry, level) || !(entry & VMX_EPT_RWX_MASK)) { + tdx_unpin(kvm, pfn); + return 0; + } + } + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); + return -EIO; + } + + hpa_with_hkid = set_hkid_to_hpa(hpa, (u16)kvm_tdx->hkid); + do { + /* + * TDX_OPERAND_BUSY can happen on locking PAMT entry. Because + * this page was removed above, other thread shouldn't be + * repeatedly operating on this page. Just retry loop. + */ + err = tdh_phymem_page_wbinvd(hpa_with_hkid); + } while (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX))); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return -EIO; + } + tdx_clear_page(hpa); + tdx_unpin(kvm, pfn); + return 0; +} + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + gpa_t gpa = gfn_to_gpa(gfn); + hpa_t hpa = __pa(private_spt); + u64 err, entry, level_state; + + err = tdh_mem_sept_add(to_kvm_tdx(kvm), gpa, tdx_level, hpa, &entry, + &level_state); + if (unlikely(err == TDX_ERROR_SEPT_BUSY)) + return -EAGAIN; + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + return -EIO; + } + + return 0; +} + +static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); + u64 err, entry, level_state; + + /* For now large page isn't supported yet. */ + WARN_ON_ONCE(level != PG_LEVEL_4K); + + err = tdh_mem_range_block(kvm_tdx, gpa, tdx_level, &entry, &level_state); + if (unlikely(err == TDX_ERROR_SEPT_BUSY)) + return -EAGAIN; + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); + return -EIO; + } + return 0; +} + /* * Ensure shared and private EPTs to be flushed on all vCPUs. * tdh_mem_track() is the only caller that increases TD epoch. An increase in @@ -471,7 +642,7 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) * occurs certainly after TD epoch increment and before the next * tdh_mem_track(). 
*/ -static void __always_unused tdx_track(struct kvm *kvm) +static void tdx_track(struct kvm *kvm) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); u64 err; @@ -492,6 +663,55 @@ static void __always_unused tdx_track(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); } +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + /* + * free_external_spt() is only called after hkid is freed when TD is + * tearing down. + * KVM doesn't (yet) zap page table pages in mirror page table while + * TD is active, though guest pages mapped in mirror page table could be + * zapped during TD is active, e.g. for shared <-> private conversion + * and slot move/deletion. + */ + if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + /* + * The HKID assigned to this TD was already freed and cache was + * already flushed. We don't have to flush again. + */ + return tdx_reclaim_page(__pa(private_spt)); +} + +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + int ret; + + /* + * HKID is released when vm_free() which is after closing gmem_fd + * which causes gmem invalidation to zap all spte. + * Population is only allowed after KVM_TDX_INIT_VM. + */ + if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) + return -EINVAL; + + ret = tdx_sept_zap_private_spte(kvm, gfn, level); + if (ret) + return ret; + + /* + * TDX requires TLB tracking before dropping private page. Do + * it here, although it is also done later. + */ + tdx_track(kvm); + + return tdx_sept_drop_private_spte(kvm, gfn, level, pfn); +} + static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) { const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h index 815e74408a34..634ed76db26a 100644 --- a/arch/x86/kvm/vmx/tdx_arch.h +++ b/arch/x86/kvm/vmx/tdx_arch.h @@ -155,6 +155,29 @@ struct td_params { #define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) #define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) +/* Additional Secure EPT entry information */ +#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0) +#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8) +#define TDX_SEPT_STATE_SHIFT 8 + +enum tdx_sept_entry_state { + TDX_SEPT_FREE = 0, + TDX_SEPT_BLOCKED = 1, + TDX_SEPT_PENDING = 2, + TDX_SEPT_PENDING_BLOCKED = 3, + TDX_SEPT_PRESENT = 4, +}; + +static inline u8 tdx_get_sept_level(u64 sept_entry_info) +{ + return sept_entry_info & TDX_SEPT_LEVEL_MASK; +} + +static inline u8 tdx_get_sept_state(u64 sept_entry_info) +{ + return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT; +} + #define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20) /* diff --git a/arch/x86/kvm/vmx/tdx_ops.h b/arch/x86/kvm/vmx/tdx_ops.h index 8ca3e252a6ed..73ffd80223b0 100644 --- a/arch/x86/kvm/vmx/tdx_ops.h +++ b/arch/x86/kvm/vmx/tdx_ops.h @@ -31,6 +31,12 @@ #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ pr_tdx_error_N(__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) +static inline int pg_level_to_tdx_sept_level(enum pg_level level) +{ + WARN_ON_ONCE(level == PG_LEVEL_NONE); + return level - 1; +} + /* * TDX module acquires its internal lock for resources. It doesn't spin to get * locks because of its restrictions of allowed execution time. 
Instead, it diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 28fda93f0b27..d1db807b793a 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -131,6 +131,15 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); + void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); #else @@ -146,6 +155,34 @@ static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {} static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } +static inline int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + void *private_spt) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + void *private_spt) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + kvm_pfn_t pfn) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + kvm_pfn_t pfn) +{ + return -EOPNOTSUPP; +} + static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {} static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {} #endif