diff mbox series

[Part2,v6,37/49] KVM: SVM: Add support to handle MSR based Page State Change VMGEXIT

Message ID 78e30b5a25c926fcfdcaafea3d484f1bb25f20b9.1655761627.git.ashish.kalra@amd.com (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu
Headers show
Series Add AMD Secure Nested Paging (SEV-SNP) | expand

Commit Message

Kalra, Ashish June 20, 2022, 11:11 p.m. UTC
From: Brijesh Singh <brijesh.singh@amd.com>

SEV-SNP VMs can ask the hypervisor to change the page state in the RMP
table to be private or shared using the Page State Change MSR protocol
as defined in the GHCB specification.

Before changing the page state in the RMP entry, lookup the page in the
NPT to make sure that there is a valid mapping for it. If the mapping
exist then try to find a workable page level between the NPT and RMP for
the page. If the page is not mapped in the NPT, then create a fault such
that it gets mapped before we change the page state in the RMP entry.

Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/include/asm/sev-common.h |   9 ++
 arch/x86/kvm/svm/sev.c            | 197 ++++++++++++++++++++++++++++++
 arch/x86/kvm/trace.h              |  34 ++++++
 arch/x86/kvm/x86.c                |   1 +
 4 files changed, 241 insertions(+)

Comments

Peter Gonda Aug. 19, 2022, 4:54 p.m. UTC | #1
> +
> +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa,
> +                                         int level)
> +{
> +       struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
> +       struct kvm *kvm = vcpu->kvm;
> +       int rc, npt_level;
> +       kvm_pfn_t pfn;
> +       gpa_t gpa_end;
> +
> +       gpa_end = gpa + page_level_size(level);
> +
> +       while (gpa < gpa_end) {
> +               /*
> +                * If the gpa is not present in the NPT then build the NPT.
> +                */
> +               rc = snp_check_and_build_npt(vcpu, gpa, level);
> +               if (rc)
> +                       return -EINVAL;
> +
> +               if (op == SNP_PAGE_STATE_PRIVATE) {
> +                       hva_t hva;
> +
> +                       if (snp_gpa_to_hva(kvm, gpa, &hva))
> +                               return -EINVAL;
> +
> +                       /*
> +                        * Verify that the hva range is registered. This enforcement is
> +                        * required to avoid the cases where a page is marked private
> +                        * in the RMP table but never gets cleanup during the VM
> +                        * termination path.
> +                        */
> +                       mutex_lock(&kvm->lock);
> +                       rc = is_hva_registered(kvm, hva, page_level_size(level));
> +                       mutex_unlock(&kvm->lock);
> +                       if (!rc)
> +                               return -EINVAL;
> +
> +                       /*
> +                        * Mark the userspace range unmerable before adding the pages
> +                        * in the RMP table.
> +                        */
> +                       mmap_write_lock(kvm->mm);
> +                       rc = snp_mark_unmergable(kvm, hva, page_level_size(level));
> +                       mmap_write_unlock(kvm->mm);
> +                       if (rc)
> +                               return -EINVAL;
> +               }
> +
> +               write_lock(&kvm->mmu_lock);
> +
> +               rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level);
> +               if (!rc) {
> +                       /*
> +                        * This may happen if another vCPU unmapped the page
> +                        * before we acquire the lock. Retry the PSC.
> +                        */
> +                       write_unlock(&kvm->mmu_lock);
> +                       return 0;
> +               }

I think we want to return -EAGAIN or similar if we want the caller to
retry, right? I think returning 0 here hides the error.

> +
> +               /*
> +                * Adjust the level so that we don't go higher than the backing
> +                * page level.
> +                */
> +               level = min_t(size_t, level, npt_level);
> +
> +               trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level);
> +
> +               switch (op) {
> +               case SNP_PAGE_STATE_SHARED:
> +                       rc = snp_make_page_shared(kvm, gpa, pfn, level);
> +                       break;
> +               case SNP_PAGE_STATE_PRIVATE:
> +                       rc = rmp_make_private(pfn, gpa, level, sev->asid, false);
> +                       break;
> +               default:
> +                       rc = -EINVAL;
> +                       break;
> +               }
> +
> +               write_unlock(&kvm->mmu_lock);
> +
> +               if (rc) {
> +                       pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n",
> +                                          op, gpa, pfn, level, rc);
> +                       return rc;
> +               }
> +
> +               gpa = gpa + page_level_size(level);
> +       }
> +
> +       return 0;
> +}
> +
Alper Gun Sept. 19, 2022, 5:53 p.m. UTC | #2
On Fri, Aug 19, 2022 at 9:54 AM Peter Gonda <pgonda@google.com> wrote:
>
> > +
> > +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa,
> > +                                         int level)
> > +{
> > +       struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
> > +       struct kvm *kvm = vcpu->kvm;
> > +       int rc, npt_level;
> > +       kvm_pfn_t pfn;
> > +       gpa_t gpa_end;
> > +
> > +       gpa_end = gpa + page_level_size(level);
> > +
> > +       while (gpa < gpa_end) {
> > +               /*
> > +                * If the gpa is not present in the NPT then build the NPT.
> > +                */
> > +               rc = snp_check_and_build_npt(vcpu, gpa, level);
> > +               if (rc)
> > +                       return -EINVAL;
> > +
> > +               if (op == SNP_PAGE_STATE_PRIVATE) {
> > +                       hva_t hva;
> > +
> > +                       if (snp_gpa_to_hva(kvm, gpa, &hva))
> > +                               return -EINVAL;
> > +
> > +                       /*
> > +                        * Verify that the hva range is registered. This enforcement is
> > +                        * required to avoid the cases where a page is marked private
> > +                        * in the RMP table but never gets cleanup during the VM
> > +                        * termination path.
> > +                        */
> > +                       mutex_lock(&kvm->lock);
> > +                       rc = is_hva_registered(kvm, hva, page_level_size(level));
> > +                       mutex_unlock(&kvm->lock);
> > +                       if (!rc)
> > +                               return -EINVAL;
> > +
> > +                       /*
> > +                        * Mark the userspace range unmerable before adding the pages
> > +                        * in the RMP table.
> > +                        */
> > +                       mmap_write_lock(kvm->mm);
> > +                       rc = snp_mark_unmergable(kvm, hva, page_level_size(level));
> > +                       mmap_write_unlock(kvm->mm);
> > +                       if (rc)
> > +                               return -EINVAL;
> > +               }
> > +
> > +               write_lock(&kvm->mmu_lock);
> > +
> > +               rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level);
> > +               if (!rc) {
> > +                       /*
> > +                        * This may happen if another vCPU unmapped the page
> > +                        * before we acquire the lock. Retry the PSC.
> > +                        */
> > +                       write_unlock(&kvm->mmu_lock);
> > +                       return 0;
> > +               }
>
> I think we want to return -EAGAIN or similar if we want the caller to
> retry, right? I think returning 0 here hides the error.
>

The problem here is that the caller(linux guest kernel) doesn't retry
if PSC fails. The current implementation in the guest kernel is that
if a page state change request fails, it terminates the VM with
GHCB_TERM_PSC reason.
Returning 0 here is not a good option because it will fail the PSC
silently and will probably cause a nested RMP fault later. Returning
an error also terminates the guest immediately with current guest
implementation. I think the best approach here is adding a retry logic
to this function. Retrying without returning an error should help it
work because snp_check_and_build_npt will be called again and in the
second attempt this should work.

> > +
> > +               /*
> > +                * Adjust the level so that we don't go higher than the backing
> > +                * page level.
> > +                */
> > +               level = min_t(size_t, level, npt_level);
> > +
> > +               trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level);
> > +
> > +               switch (op) {
> > +               case SNP_PAGE_STATE_SHARED:
> > +                       rc = snp_make_page_shared(kvm, gpa, pfn, level);
> > +                       break;
> > +               case SNP_PAGE_STATE_PRIVATE:
> > +                       rc = rmp_make_private(pfn, gpa, level, sev->asid, false);
> > +                       break;
> > +               default:
> > +                       rc = -EINVAL;
> > +                       break;
> > +               }
> > +
> > +               write_unlock(&kvm->mmu_lock);
> > +
> > +               if (rc) {
> > +                       pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n",
> > +                                          op, gpa, pfn, level, rc);
> > +                       return rc;
> > +               }
> > +
> > +               gpa = gpa + page_level_size(level);
> > +       }
> > +
> > +       return 0;
> > +}
> > +
Tom Lendacky Sept. 19, 2022, 9:38 p.m. UTC | #3
On 9/19/22 12:53, Alper Gun wrote:
> On Fri, Aug 19, 2022 at 9:54 AM Peter Gonda <pgonda@google.com> wrote:
>>
>>> +
>>> +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa,
>>> +                                         int level)
>>> +{
>>> +       struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
>>> +       struct kvm *kvm = vcpu->kvm;
>>> +       int rc, npt_level;
>>> +       kvm_pfn_t pfn;
>>> +       gpa_t gpa_end;
>>> +
>>> +       gpa_end = gpa + page_level_size(level);
>>> +
>>> +       while (gpa < gpa_end) {
>>> +               /*
>>> +                * If the gpa is not present in the NPT then build the NPT.
>>> +                */
>>> +               rc = snp_check_and_build_npt(vcpu, gpa, level);
>>> +               if (rc)
>>> +                       return -EINVAL;
>>> +
>>> +               if (op == SNP_PAGE_STATE_PRIVATE) {
>>> +                       hva_t hva;
>>> +
>>> +                       if (snp_gpa_to_hva(kvm, gpa, &hva))
>>> +                               return -EINVAL;
>>> +
>>> +                       /*
>>> +                        * Verify that the hva range is registered. This enforcement is
>>> +                        * required to avoid the cases where a page is marked private
>>> +                        * in the RMP table but never gets cleanup during the VM
>>> +                        * termination path.
>>> +                        */
>>> +                       mutex_lock(&kvm->lock);
>>> +                       rc = is_hva_registered(kvm, hva, page_level_size(level));
>>> +                       mutex_unlock(&kvm->lock);
>>> +                       if (!rc)
>>> +                               return -EINVAL;
>>> +
>>> +                       /*
>>> +                        * Mark the userspace range unmerable before adding the pages
>>> +                        * in the RMP table.
>>> +                        */
>>> +                       mmap_write_lock(kvm->mm);
>>> +                       rc = snp_mark_unmergable(kvm, hva, page_level_size(level));
>>> +                       mmap_write_unlock(kvm->mm);
>>> +                       if (rc)
>>> +                               return -EINVAL;
>>> +               }
>>> +
>>> +               write_lock(&kvm->mmu_lock);
>>> +
>>> +               rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level);
>>> +               if (!rc) {
>>> +                       /*
>>> +                        * This may happen if another vCPU unmapped the page
>>> +                        * before we acquire the lock. Retry the PSC.
>>> +                        */
>>> +                       write_unlock(&kvm->mmu_lock);
>>> +                       return 0;
>>> +               }
>>
>> I think we want to return -EAGAIN or similar if we want the caller to
>> retry, right? I think returning 0 here hides the error.
>>
> 
> The problem here is that the caller(linux guest kernel) doesn't retry
> if PSC fails. The current implementation in the guest kernel is that
> if a page state change request fails, it terminates the VM with
> GHCB_TERM_PSC reason.
> Returning 0 here is not a good option because it will fail the PSC
> silently and will probably cause a nested RMP fault later. Returning

Returning 0 here is ok because the PSC current index into the PSC 
structure will not be updated and the guest will then retry (see the loop 
in vmgexit_psc() in arch/x86/kernel/sev.c).

Thanks,
Tom

> an error also terminates the guest immediately with current guest
> implementation. I think the best approach here is adding a retry logic
> to this function. Retrying without returning an error should help it
> work because snp_check_and_build_npt will be called again and in the
> second attempt this should work.
> 
>>> +
>>> +               /*
>>> +                * Adjust the level so that we don't go higher than the backing
>>> +                * page level.
>>> +                */
>>> +               level = min_t(size_t, level, npt_level);
>>> +
>>> +               trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level);
>>> +
>>> +               switch (op) {
>>> +               case SNP_PAGE_STATE_SHARED:
>>> +                       rc = snp_make_page_shared(kvm, gpa, pfn, level);
>>> +                       break;
>>> +               case SNP_PAGE_STATE_PRIVATE:
>>> +                       rc = rmp_make_private(pfn, gpa, level, sev->asid, false);
>>> +                       break;
>>> +               default:
>>> +                       rc = -EINVAL;
>>> +                       break;
>>> +               }
>>> +
>>> +               write_unlock(&kvm->mmu_lock);
>>> +
>>> +               if (rc) {
>>> +                       pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n",
>>> +                                          op, gpa, pfn, level, rc);
>>> +                       return rc;
>>> +               }
>>> +
>>> +               gpa = gpa + page_level_size(level);
>>> +       }
>>> +
>>> +       return 0;
>>> +}
>>> +
Alper Gun Sept. 19, 2022, 10:02 p.m. UTC | #4
On Mon, Sep 19, 2022 at 2:38 PM Tom Lendacky <thomas.lendacky@amd.com> wrote:
>
> On 9/19/22 12:53, Alper Gun wrote:
> > On Fri, Aug 19, 2022 at 9:54 AM Peter Gonda <pgonda@google.com> wrote:
> >>
> >>> +
> >>> +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa,
> >>> +                                         int level)
> >>> +{
> >>> +       struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
> >>> +       struct kvm *kvm = vcpu->kvm;
> >>> +       int rc, npt_level;
> >>> +       kvm_pfn_t pfn;
> >>> +       gpa_t gpa_end;
> >>> +
> >>> +       gpa_end = gpa + page_level_size(level);
> >>> +
> >>> +       while (gpa < gpa_end) {
> >>> +               /*
> >>> +                * If the gpa is not present in the NPT then build the NPT.
> >>> +                */
> >>> +               rc = snp_check_and_build_npt(vcpu, gpa, level);
> >>> +               if (rc)
> >>> +                       return -EINVAL;
> >>> +
> >>> +               if (op == SNP_PAGE_STATE_PRIVATE) {
> >>> +                       hva_t hva;
> >>> +
> >>> +                       if (snp_gpa_to_hva(kvm, gpa, &hva))
> >>> +                               return -EINVAL;
> >>> +
> >>> +                       /*
> >>> +                        * Verify that the hva range is registered. This enforcement is
> >>> +                        * required to avoid the cases where a page is marked private
> >>> +                        * in the RMP table but never gets cleanup during the VM
> >>> +                        * termination path.
> >>> +                        */
> >>> +                       mutex_lock(&kvm->lock);
> >>> +                       rc = is_hva_registered(kvm, hva, page_level_size(level));
> >>> +                       mutex_unlock(&kvm->lock);
> >>> +                       if (!rc)
> >>> +                               return -EINVAL;
> >>> +
> >>> +                       /*
> >>> +                        * Mark the userspace range unmerable before adding the pages
> >>> +                        * in the RMP table.
> >>> +                        */
> >>> +                       mmap_write_lock(kvm->mm);
> >>> +                       rc = snp_mark_unmergable(kvm, hva, page_level_size(level));
> >>> +                       mmap_write_unlock(kvm->mm);
> >>> +                       if (rc)
> >>> +                               return -EINVAL;
> >>> +               }
> >>> +
> >>> +               write_lock(&kvm->mmu_lock);
> >>> +
> >>> +               rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level);
> >>> +               if (!rc) {
> >>> +                       /*
> >>> +                        * This may happen if another vCPU unmapped the page
> >>> +                        * before we acquire the lock. Retry the PSC.
> >>> +                        */
> >>> +                       write_unlock(&kvm->mmu_lock);
> >>> +                       return 0;
> >>> +               }
> >>
> >> I think we want to return -EAGAIN or similar if we want the caller to
> >> retry, right? I think returning 0 here hides the error.
> >>
> >
> > The problem here is that the caller(linux guest kernel) doesn't retry
> > if PSC fails. The current implementation in the guest kernel is that
> > if a page state change request fails, it terminates the VM with
> > GHCB_TERM_PSC reason.
> > Returning 0 here is not a good option because it will fail the PSC
> > silently and will probably cause a nested RMP fault later. Returning
>
> Returning 0 here is ok because the PSC current index into the PSC
> structure will not be updated and the guest will then retry (see the loop
> in vmgexit_psc() in arch/x86/kernel/sev.c).
>
> Thanks,
> Tom

But the host code updates the index. It doesn't leave the loop because
rc is 0. The guest will think that it is successful.
rc = __snp_handle_page_state_change(vcpu, op, gpa, level);
if (rc)
goto out;

Also the page state change request with MSR is not retried. It
terminates the VM if the MSR request fails.

>
> > an error also terminates the guest immediately with current guest
> > implementation. I think the best approach here is adding a retry logic
> > to this function. Retrying without returning an error should help it
> > work because snp_check_and_build_npt will be called again and in the
> > second attempt this should work.
> >
> >>> +
> >>> +               /*
> >>> +                * Adjust the level so that we don't go higher than the backing
> >>> +                * page level.
> >>> +                */
> >>> +               level = min_t(size_t, level, npt_level);
> >>> +
> >>> +               trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level);
> >>> +
> >>> +               switch (op) {
> >>> +               case SNP_PAGE_STATE_SHARED:
> >>> +                       rc = snp_make_page_shared(kvm, gpa, pfn, level);
> >>> +                       break;
> >>> +               case SNP_PAGE_STATE_PRIVATE:
> >>> +                       rc = rmp_make_private(pfn, gpa, level, sev->asid, false);
> >>> +                       break;
> >>> +               default:
> >>> +                       rc = -EINVAL;
> >>> +                       break;
> >>> +               }
> >>> +
> >>> +               write_unlock(&kvm->mmu_lock);
> >>> +
> >>> +               if (rc) {
> >>> +                       pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n",
> >>> +                                          op, gpa, pfn, level, rc);
> >>> +                       return rc;
> >>> +               }
> >>> +
> >>> +               gpa = gpa + page_level_size(level);
> >>> +       }
> >>> +
> >>> +       return 0;
> >>> +}
> >>> +
Tom Lendacky Sept. 19, 2022, 10:18 p.m. UTC | #5
On 9/19/22 17:02, Alper Gun wrote:
> On Mon, Sep 19, 2022 at 2:38 PM Tom Lendacky <thomas.lendacky@amd.com> wrote:
>>
>> On 9/19/22 12:53, Alper Gun wrote:
>>> On Fri, Aug 19, 2022 at 9:54 AM Peter Gonda <pgonda@google.com> wrote:
>>>>
>>>>> +
>>>>> +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa,
>>>>> +                                         int level)
>>>>> +{
>>>>> +       struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
>>>>> +       struct kvm *kvm = vcpu->kvm;
>>>>> +       int rc, npt_level;
>>>>> +       kvm_pfn_t pfn;
>>>>> +       gpa_t gpa_end;
>>>>> +
>>>>> +       gpa_end = gpa + page_level_size(level);
>>>>> +
>>>>> +       while (gpa < gpa_end) {
>>>>> +               /*
>>>>> +                * If the gpa is not present in the NPT then build the NPT.
>>>>> +                */
>>>>> +               rc = snp_check_and_build_npt(vcpu, gpa, level);
>>>>> +               if (rc)
>>>>> +                       return -EINVAL;
>>>>> +
>>>>> +               if (op == SNP_PAGE_STATE_PRIVATE) {
>>>>> +                       hva_t hva;
>>>>> +
>>>>> +                       if (snp_gpa_to_hva(kvm, gpa, &hva))
>>>>> +                               return -EINVAL;
>>>>> +
>>>>> +                       /*
>>>>> +                        * Verify that the hva range is registered. This enforcement is
>>>>> +                        * required to avoid the cases where a page is marked private
>>>>> +                        * in the RMP table but never gets cleanup during the VM
>>>>> +                        * termination path.
>>>>> +                        */
>>>>> +                       mutex_lock(&kvm->lock);
>>>>> +                       rc = is_hva_registered(kvm, hva, page_level_size(level));
>>>>> +                       mutex_unlock(&kvm->lock);
>>>>> +                       if (!rc)
>>>>> +                               return -EINVAL;
>>>>> +
>>>>> +                       /*
>>>>> +                        * Mark the userspace range unmerable before adding the pages
>>>>> +                        * in the RMP table.
>>>>> +                        */
>>>>> +                       mmap_write_lock(kvm->mm);
>>>>> +                       rc = snp_mark_unmergable(kvm, hva, page_level_size(level));
>>>>> +                       mmap_write_unlock(kvm->mm);
>>>>> +                       if (rc)
>>>>> +                               return -EINVAL;
>>>>> +               }
>>>>> +
>>>>> +               write_lock(&kvm->mmu_lock);
>>>>> +
>>>>> +               rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level);
>>>>> +               if (!rc) {
>>>>> +                       /*
>>>>> +                        * This may happen if another vCPU unmapped the page
>>>>> +                        * before we acquire the lock. Retry the PSC.
>>>>> +                        */
>>>>> +                       write_unlock(&kvm->mmu_lock);
>>>>> +                       return 0;
>>>>> +               }
>>>>
>>>> I think we want to return -EAGAIN or similar if we want the caller to
>>>> retry, right? I think returning 0 here hides the error.
>>>>
>>>
>>> The problem here is that the caller(linux guest kernel) doesn't retry
>>> if PSC fails. The current implementation in the guest kernel is that
>>> if a page state change request fails, it terminates the VM with
>>> GHCB_TERM_PSC reason.
>>> Returning 0 here is not a good option because it will fail the PSC
>>> silently and will probably cause a nested RMP fault later. Returning
>>
>> Returning 0 here is ok because the PSC current index into the PSC
>> structure will not be updated and the guest will then retry (see the loop
>> in vmgexit_psc() in arch/x86/kernel/sev.c).
>>
>> Thanks,
>> Tom
> 
> But the host code updates the index. It doesn't leave the loop because
> rc is 0. The guest will think that it is successful.
> rc = __snp_handle_page_state_change(vcpu, op, gpa, level);
> if (rc)
> goto out;
> 
> Also the page state change request with MSR is not retried. It
> terminates the VM if the MSR request fails.

Ah, right. I see what you mean. It should probably return a -EAGAIN 
instead of 0 and then the if (rc) check should be modified to specifically 
look for -EAGAIN and goto out after setting rc to 0.

But that does leave the MSR protocol open to the problem that you mention, 
so, yes, retry logic in snp_handle_page_state_change() for a -EAGAIN seems 
reasonable.

Thanks,
Tom

> 
>>
>>> an error also terminates the guest immediately with current guest
>>> implementation. I think the best approach here is adding a retry logic
>>> to this function. Retrying without returning an error should help it
>>> work because snp_check_and_build_npt will be called again and in the
>>> second attempt this should work.
>>>
>>>>> +
>>>>> +               /*
>>>>> +                * Adjust the level so that we don't go higher than the backing
>>>>> +                * page level.
>>>>> +                */
>>>>> +               level = min_t(size_t, level, npt_level);
>>>>> +
>>>>> +               trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level);
>>>>> +
>>>>> +               switch (op) {
>>>>> +               case SNP_PAGE_STATE_SHARED:
>>>>> +                       rc = snp_make_page_shared(kvm, gpa, pfn, level);
>>>>> +                       break;
>>>>> +               case SNP_PAGE_STATE_PRIVATE:
>>>>> +                       rc = rmp_make_private(pfn, gpa, level, sev->asid, false);
>>>>> +                       break;
>>>>> +               default:
>>>>> +                       rc = -EINVAL;
>>>>> +                       break;
>>>>> +               }
>>>>> +
>>>>> +               write_unlock(&kvm->mmu_lock);
>>>>> +
>>>>> +               if (rc) {
>>>>> +                       pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n",
>>>>> +                                          op, gpa, pfn, level, rc);
>>>>> +                       return rc;
>>>>> +               }
>>>>> +
>>>>> +               gpa = gpa + page_level_size(level);
>>>>> +       }
>>>>> +
>>>>> +       return 0;
>>>>> +}
>>>>> +
Ashish Kalra Sept. 19, 2022, 11:46 p.m. UTC | #6
On 9/19/22 22:18, Tom Lendacky wrote:
> On 9/19/22 17:02, Alper Gun wrote:
>> On Mon, Sep 19, 2022 at 2:38 PM Tom Lendacky 
>> <thomas.lendacky@amd.com> wrote:
>>>
>>> On 9/19/22 12:53, Alper Gun wrote:
>>>> On Fri, Aug 19, 2022 at 9:54 AM Peter Gonda <pgonda@google.com> wrote:
>>>>>
>>>>>> +
>>>>>> +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, 
>>>>>> enum psc_op op, gpa_t gpa,
>>>>>> +                                         int level)
>>>>>> +{
>>>>>> +       struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
>>>>>> +       struct kvm *kvm = vcpu->kvm;
>>>>>> +       int rc, npt_level;
>>>>>> +       kvm_pfn_t pfn;
>>>>>> +       gpa_t gpa_end;
>>>>>> +
>>>>>> +       gpa_end = gpa + page_level_size(level);
>>>>>> +
>>>>>> +       while (gpa < gpa_end) {
>>>>>> +               /*
>>>>>> +                * If the gpa is not present in the NPT then 
>>>>>> build the NPT.
>>>>>> +                */
>>>>>> +               rc = snp_check_and_build_npt(vcpu, gpa, level);
>>>>>> +               if (rc)
>>>>>> +                       return -EINVAL;
>>>>>> +
>>>>>> +               if (op == SNP_PAGE_STATE_PRIVATE) {
>>>>>> +                       hva_t hva;
>>>>>> +
>>>>>> +                       if (snp_gpa_to_hva(kvm, gpa, &hva))
>>>>>> +                               return -EINVAL;
>>>>>> +
>>>>>> +                       /*
>>>>>> +                        * Verify that the hva range is 
>>>>>> registered. This enforcement is
>>>>>> +                        * required to avoid the cases where a 
>>>>>> page is marked private
>>>>>> +                        * in the RMP table but never gets 
>>>>>> cleanup during the VM
>>>>>> +                        * termination path.
>>>>>> +                        */
>>>>>> +                       mutex_lock(&kvm->lock);
>>>>>> +                       rc = is_hva_registered(kvm, hva, 
>>>>>> page_level_size(level));
>>>>>> +                       mutex_unlock(&kvm->lock);
>>>>>> +                       if (!rc)
>>>>>> +                               return -EINVAL;
>>>>>> +
>>>>>> +                       /*
>>>>>> +                        * Mark the userspace range unmerable 
>>>>>> before adding the pages
>>>>>> +                        * in the RMP table.
>>>>>> +                        */
>>>>>> +                       mmap_write_lock(kvm->mm);
>>>>>> +                       rc = snp_mark_unmergable(kvm, hva, 
>>>>>> page_level_size(level));
>>>>>> +                       mmap_write_unlock(kvm->mm);
>>>>>> +                       if (rc)
>>>>>> +                               return -EINVAL;
>>>>>> +               }
>>>>>> +
>>>>>> +               write_lock(&kvm->mmu_lock);
>>>>>> +
>>>>>> +               rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, 
>>>>>> &npt_level);
>>>>>> +               if (!rc) {
>>>>>> +                       /*
>>>>>> +                        * This may happen if another vCPU 
>>>>>> unmapped the page
>>>>>> +                        * before we acquire the lock. Retry the 
>>>>>> PSC.
>>>>>> +                        */
>>>>>> + write_unlock(&kvm->mmu_lock);
>>>>>> +                       return 0;
>>>>>> +               }
>>>>>
>>>>> I think we want to return -EAGAIN or similar if we want the caller to
>>>>> retry, right? I think returning 0 here hides the error.
>>>>>
>>>>
>>>> The problem here is that the caller(linux guest kernel) doesn't retry
>>>> if PSC fails. The current implementation in the guest kernel is that
>>>> if a page state change request fails, it terminates the VM with
>>>> GHCB_TERM_PSC reason.
>>>> Returning 0 here is not a good option because it will fail the PSC
>>>> silently and will probably cause a nested RMP fault later. Returning
>>>
>>> Returning 0 here is ok because the PSC current index into the PSC
>>> structure will not be updated and the guest will then retry (see the 
>>> loop
>>> in vmgexit_psc() in arch/x86/kernel/sev.c).
>>>
>>> Thanks,
>>> Tom
>>
>> But the host code updates the index. It doesn't leave the loop because
>> rc is 0. The guest will think that it is successful.
>> rc = __snp_handle_page_state_change(vcpu, op, gpa, level);
>> if (rc)
>> goto out;
>>
>> Also the page state change request with MSR is not retried. It
>> terminates the VM if the MSR request fails.
>
> Ah, right. I see what you mean. It should probably return a -EAGAIN 
> instead of 0 and then the if (rc) check should be modified to 
> specifically look for -EAGAIN and goto out after setting rc to 0.
>
> But that does leave the MSR protocol open to the problem that you 
> mention, so, yes, retry logic in snp_handle_page_state_change() for a 
> -EAGAIN seems reasonable.
>
> Thanks,
> Tom

I believe it makes more sense to add the retry logic within 
__snp_handle_page_state_change() itself, as that will make it work for 
both the GHCB MSR protocol and the GHCB VMGEXIT requests.

Thanks, Ashish

>
>>
>>>
>>>> an error also terminates the guest immediately with current guest
>>>> implementation. I think the best approach here is adding a retry logic
>>>> to this function. Retrying without returning an error should help it
>>>> work because snp_check_and_build_npt will be called again and in the
>>>> second attempt this should work.
>>>>
>>>>>> +
>>>>>> +               /*
>>>>>> +                * Adjust the level so that we don't go higher 
>>>>>> than the backing
>>>>>> +                * page level.
>>>>>> +                */
>>>>>> +               level = min_t(size_t, level, npt_level);
>>>>>> +
>>>>>> +               trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, 
>>>>>> level);
>>>>>> +
>>>>>> +               switch (op) {
>>>>>> +               case SNP_PAGE_STATE_SHARED:
>>>>>> +                       rc = snp_make_page_shared(kvm, gpa, pfn, 
>>>>>> level);
>>>>>> +                       break;
>>>>>> +               case SNP_PAGE_STATE_PRIVATE:
>>>>>> +                       rc = rmp_make_private(pfn, gpa, level, 
>>>>>> sev->asid, false);
>>>>>> +                       break;
>>>>>> +               default:
>>>>>> +                       rc = -EINVAL;
>>>>>> +                       break;
>>>>>> +               }
>>>>>> +
>>>>>> +               write_unlock(&kvm->mmu_lock);
>>>>>> +
>>>>>> +               if (rc) {
>>>>>> +                       pr_err_ratelimited("Error op %d gpa %llx 
>>>>>> pfn %llx level %d rc %d\n",
>>>>>> +                                          op, gpa, pfn, level, rc);
>>>>>> +                       return rc;
>>>>>> +               }
>>>>>> +
>>>>>> +               gpa = gpa + page_level_size(level);
>>>>>> +       }
>>>>>> +
>>>>>> +       return 0;
>>>>>> +}
>>>>>> +
Peter Gonda Sept. 26, 2022, 3:19 p.m. UTC | #7
On Mon, Sep 19, 2022 at 5:47 PM Ashish Kalra <ashkalra@amd.com> wrote:
>
>
> On 9/19/22 22:18, Tom Lendacky wrote:
> > On 9/19/22 17:02, Alper Gun wrote:
> >> On Mon, Sep 19, 2022 at 2:38 PM Tom Lendacky
> >> <thomas.lendacky@amd.com> wrote:
> >>>
> >>> On 9/19/22 12:53, Alper Gun wrote:
> >>>> On Fri, Aug 19, 2022 at 9:54 AM Peter Gonda <pgonda@google.com> wrote:
> >>>>>
> >>>>>> +
> >>>>>> +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu,
> >>>>>> enum psc_op op, gpa_t gpa,
> >>>>>> +                                         int level)
> >>>>>> +{
> >>>>>> +       struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
> >>>>>> +       struct kvm *kvm = vcpu->kvm;
> >>>>>> +       int rc, npt_level;
> >>>>>> +       kvm_pfn_t pfn;
> >>>>>> +       gpa_t gpa_end;
> >>>>>> +
> >>>>>> +       gpa_end = gpa + page_level_size(level);
> >>>>>> +
> >>>>>> +       while (gpa < gpa_end) {
> >>>>>> +               /*
> >>>>>> +                * If the gpa is not present in the NPT then
> >>>>>> build the NPT.
> >>>>>> +                */
> >>>>>> +               rc = snp_check_and_build_npt(vcpu, gpa, level);
> >>>>>> +               if (rc)
> >>>>>> +                       return -EINVAL;
> >>>>>> +
> >>>>>> +               if (op == SNP_PAGE_STATE_PRIVATE) {
> >>>>>> +                       hva_t hva;
> >>>>>> +
> >>>>>> +                       if (snp_gpa_to_hva(kvm, gpa, &hva))
> >>>>>> +                               return -EINVAL;
> >>>>>> +
> >>>>>> +                       /*
> >>>>>> +                        * Verify that the hva range is
> >>>>>> registered. This enforcement is
> >>>>>> +                        * required to avoid the cases where a
> >>>>>> page is marked private
> >>>>>> +                        * in the RMP table but never gets
> >>>>>> cleanup during the VM
> >>>>>> +                        * termination path.
> >>>>>> +                        */
> >>>>>> +                       mutex_lock(&kvm->lock);
> >>>>>> +                       rc = is_hva_registered(kvm, hva,
> >>>>>> page_level_size(level));
> >>>>>> +                       mutex_unlock(&kvm->lock);
> >>>>>> +                       if (!rc)
> >>>>>> +                               return -EINVAL;
> >>>>>> +
> >>>>>> +                       /*
> >>>>>> +                        * Mark the userspace range unmerable
> >>>>>> before adding the pages
> >>>>>> +                        * in the RMP table.
> >>>>>> +                        */
> >>>>>> +                       mmap_write_lock(kvm->mm);
> >>>>>> +                       rc = snp_mark_unmergable(kvm, hva,
> >>>>>> page_level_size(level));
> >>>>>> +                       mmap_write_unlock(kvm->mm);
> >>>>>> +                       if (rc)
> >>>>>> +                               return -EINVAL;
> >>>>>> +               }
> >>>>>> +
> >>>>>> +               write_lock(&kvm->mmu_lock);
> >>>>>> +
> >>>>>> +               rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn,
> >>>>>> &npt_level);
> >>>>>> +               if (!rc) {
> >>>>>> +                       /*
> >>>>>> +                        * This may happen if another vCPU
> >>>>>> unmapped the page
> >>>>>> +                        * before we acquire the lock. Retry the
> >>>>>> PSC.
> >>>>>> +                        */
> >>>>>> + write_unlock(&kvm->mmu_lock);
> >>>>>> +                       return 0;
> >>>>>> +               }
> >>>>>
> >>>>> I think we want to return -EAGAIN or similar if we want the caller to
> >>>>> retry, right? I think returning 0 here hides the error.
> >>>>>
> >>>>
> >>>> The problem here is that the caller(linux guest kernel) doesn't retry
> >>>> if PSC fails. The current implementation in the guest kernel is that
> >>>> if a page state change request fails, it terminates the VM with
> >>>> GHCB_TERM_PSC reason.
> >>>> Returning 0 here is not a good option because it will fail the PSC
> >>>> silently and will probably cause a nested RMP fault later. Returning
> >>>
> >>> Returning 0 here is ok because the PSC current index into the PSC
> >>> structure will not be updated and the guest will then retry (see the
> >>> loop
> >>> in vmgexit_psc() in arch/x86/kernel/sev.c).
> >>>
> >>> Thanks,
> >>> Tom
> >>
> >> But the host code updates the index. It doesn't leave the loop because
> >> rc is 0. The guest will think that it is successful.
> >> rc = __snp_handle_page_state_change(vcpu, op, gpa, level);
> >> if (rc)
> >> goto out;
> >>
> >> Also the page state change request with MSR is not retried. It
> >> terminates the VM if the MSR request fails.
> >
> > Ah, right. I see what you mean. It should probably return a -EAGAIN
> > instead of 0 and then the if (rc) check should be modified to
> > specifically look for -EAGAIN and goto out after setting rc to 0.
> >
> > But that does leave the MSR protocol open to the problem that you
> > mention, so, yes, retry logic in snp_handle_page_state_change() for a
> > -EAGAIN seems reasonable.
> >
> > Thanks,
> > Tom
>
> I believe it makes more sense to add the retry logic within
> __snp_handle_page_state_change() itself, as that will make it work for
> both the GHCB MSR protocol and the GHCB VMGEXIT requests.

You are suggesting we just retry 'kvm_mmu_get_tdp_walk' inside of
__snp_handle_page_state_change()? That should work but how many times
do we retry? If we return EAGAIN or error we can leave it up to the
caller

>
> Thanks, Ashish
>
> >
> >>
> >>>
> >>>> an error also terminates the guest immediately with current guest
> >>>> implementation. I think the best approach here is adding a retry logic
> >>>> to this function. Retrying without returning an error should help it
> >>>> work because snp_check_and_build_npt will be called again and in the
> >>>> second attempt this should work.
> >>>>
> >>>>>> +
> >>>>>> +               /*
> >>>>>> +                * Adjust the level so that we don't go higher
> >>>>>> than the backing
> >>>>>> +                * page level.
> >>>>>> +                */
> >>>>>> +               level = min_t(size_t, level, npt_level);
> >>>>>> +
> >>>>>> +               trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op,
> >>>>>> level);
> >>>>>> +
> >>>>>> +               switch (op) {
> >>>>>> +               case SNP_PAGE_STATE_SHARED:
> >>>>>> +                       rc = snp_make_page_shared(kvm, gpa, pfn,
> >>>>>> level);
> >>>>>> +                       break;
> >>>>>> +               case SNP_PAGE_STATE_PRIVATE:
> >>>>>> +                       rc = rmp_make_private(pfn, gpa, level,
> >>>>>> sev->asid, false);
> >>>>>> +                       break;
> >>>>>> +               default:
> >>>>>> +                       rc = -EINVAL;
> >>>>>> +                       break;
> >>>>>> +               }
> >>>>>> +
> >>>>>> +               write_unlock(&kvm->mmu_lock);
> >>>>>> +
> >>>>>> +               if (rc) {
> >>>>>> +                       pr_err_ratelimited("Error op %d gpa %llx
> >>>>>> pfn %llx level %d rc %d\n",
> >>>>>> +                                          op, gpa, pfn, level, rc);
> >>>>>> +                       return rc;
> >>>>>> +               }
> >>>>>> +
> >>>>>> +               gpa = gpa + page_level_size(level);
> >>>>>> +       }
> >>>>>> +
> >>>>>> +       return 0;
> >>>>>> +}
> >>>>>> +
Kalra, Ashish Oct. 12, 2022, 8:15 p.m. UTC | #8
On 9/26/2022 10:19 AM, Peter Gonda wrote:
> On Mon, Sep 19, 2022 at 5:47 PM Ashish Kalra <ashkalra@amd.com> wrote:
>>
>>
>> On 9/19/22 22:18, Tom Lendacky wrote:
>>> On 9/19/22 17:02, Alper Gun wrote:
>>>> On Mon, Sep 19, 2022 at 2:38 PM Tom Lendacky
>>>> <thomas.lendacky@amd.com> wrote:
>>>>>
>>>>> On 9/19/22 12:53, Alper Gun wrote:
>>>>>> On Fri, Aug 19, 2022 at 9:54 AM Peter Gonda <pgonda@google.com> wrote:
>>>>>>>
>>>>>>>> +
>>>>>>>> +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu,
>>>>>>>> enum psc_op op, gpa_t gpa,
>>>>>>>> +                                         int level)
>>>>>>>> +{
>>>>>>>> +       struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
>>>>>>>> +       struct kvm *kvm = vcpu->kvm;
>>>>>>>> +       int rc, npt_level;
>>>>>>>> +       kvm_pfn_t pfn;
>>>>>>>> +       gpa_t gpa_end;
>>>>>>>> +
>>>>>>>> +       gpa_end = gpa + page_level_size(level);
>>>>>>>> +
>>>>>>>> +       while (gpa < gpa_end) {
>>>>>>>> +               /*
>>>>>>>> +                * If the gpa is not present in the NPT then
>>>>>>>> build the NPT.
>>>>>>>> +                */
>>>>>>>> +               rc = snp_check_and_build_npt(vcpu, gpa, level);
>>>>>>>> +               if (rc)
>>>>>>>> +                       return -EINVAL;
>>>>>>>> +
>>>>>>>> +               if (op == SNP_PAGE_STATE_PRIVATE) {
>>>>>>>> +                       hva_t hva;
>>>>>>>> +
>>>>>>>> +                       if (snp_gpa_to_hva(kvm, gpa, &hva))
>>>>>>>> +                               return -EINVAL;
>>>>>>>> +
>>>>>>>> +                       /*
>>>>>>>> +                        * Verify that the hva range is
>>>>>>>> registered. This enforcement is
>>>>>>>> +                        * required to avoid the cases where a
>>>>>>>> page is marked private
>>>>>>>> +                        * in the RMP table but never gets
>>>>>>>> cleanup during the VM
>>>>>>>> +                        * termination path.
>>>>>>>> +                        */
>>>>>>>> +                       mutex_lock(&kvm->lock);
>>>>>>>> +                       rc = is_hva_registered(kvm, hva,
>>>>>>>> page_level_size(level));
>>>>>>>> +                       mutex_unlock(&kvm->lock);
>>>>>>>> +                       if (!rc)
>>>>>>>> +                               return -EINVAL;
>>>>>>>> +
>>>>>>>> +                       /*
>>>>>>>> +                        * Mark the userspace range unmerable
>>>>>>>> before adding the pages
>>>>>>>> +                        * in the RMP table.
>>>>>>>> +                        */
>>>>>>>> +                       mmap_write_lock(kvm->mm);
>>>>>>>> +                       rc = snp_mark_unmergable(kvm, hva,
>>>>>>>> page_level_size(level));
>>>>>>>> +                       mmap_write_unlock(kvm->mm);
>>>>>>>> +                       if (rc)
>>>>>>>> +                               return -EINVAL;
>>>>>>>> +               }
>>>>>>>> +
>>>>>>>> +               write_lock(&kvm->mmu_lock);
>>>>>>>> +
>>>>>>>> +               rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn,
>>>>>>>> &npt_level);
>>>>>>>> +               if (!rc) {
>>>>>>>> +                       /*
>>>>>>>> +                        * This may happen if another vCPU
>>>>>>>> unmapped the page
>>>>>>>> +                        * before we acquire the lock. Retry the
>>>>>>>> PSC.
>>>>>>>> +                        */
>>>>>>>> + write_unlock(&kvm->mmu_lock);
>>>>>>>> +                       return 0;
>>>>>>>> +               }
>>>>>>>
>>>>>>> I think we want to return -EAGAIN or similar if we want the caller to
>>>>>>> retry, right? I think returning 0 here hides the error.
>>>>>>>
>>>>>>
>>>>>> The problem here is that the caller(linux guest kernel) doesn't retry
>>>>>> if PSC fails. The current implementation in the guest kernel is that
>>>>>> if a page state change request fails, it terminates the VM with
>>>>>> GHCB_TERM_PSC reason.
>>>>>> Returning 0 here is not a good option because it will fail the PSC
>>>>>> silently and will probably cause a nested RMP fault later. Returning
>>>>>
>>>>> Returning 0 here is ok because the PSC current index into the PSC
>>>>> structure will not be updated and the guest will then retry (see the
>>>>> loop
>>>>> in vmgexit_psc() in arch/x86/kernel/sev.c).
>>>>>
>>>>> Thanks,
>>>>> Tom
>>>>
>>>> But the host code updates the index. It doesn't leave the loop because
>>>> rc is 0. The guest will think that it is successful.
>>>> rc = __snp_handle_page_state_change(vcpu, op, gpa, level);
>>>> if (rc)
>>>> goto out;
>>>>
>>>> Also the page state change request with MSR is not retried. It
>>>> terminates the VM if the MSR request fails.
>>>
>>> Ah, right. I see what you mean. It should probably return a -EAGAIN
>>> instead of 0 and then the if (rc) check should be modified to
>>> specifically look for -EAGAIN and goto out after setting rc to 0.
>>>
>>> But that does leave the MSR protocol open to the problem that you
>>> mention, so, yes, retry logic in snp_handle_page_state_change() for a
>>> -EAGAIN seems reasonable.
>>>
>>> Thanks,
>>> Tom
>>
>> I believe it makes more sense to add the retry logic within
>> __snp_handle_page_state_change() itself, as that will make it work for
>> both the GHCB MSR protocol and the GHCB VMGEXIT requests.
> 
> You are suggesting we just retry 'kvm_mmu_get_tdp_walk' inside of
> __snp_handle_page_state_change()? That should work but how many times
> do we retry? If we return EAGAIN or error we can leave it up to the
> caller
> 

Ok, we return -EAGAIN here and then let the caller in 
snp_handle_page_state_change() or sev_handle_vmgexit_msr_protocol()
(in case of GHCB MSR protocol) do the retries.

But, the question still remains, how may retry attempts should the 
caller attempt ?

Thanks,
Ashish

>>>
>>>>
>>>>>
>>>>>> an error also terminates the guest immediately with current guest
>>>>>> implementation. I think the best approach here is adding a retry logic
>>>>>> to this function. Retrying without returning an error should help it
>>>>>> work because snp_check_and_build_npt will be called again and in the
>>>>>> second attempt this should work.
>>>>>>
>>>>>>>> +
>>>>>>>> +               /*
>>>>>>>> +                * Adjust the level so that we don't go higher
>>>>>>>> than the backing
>>>>>>>> +                * page level.
>>>>>>>> +                */
>>>>>>>> +               level = min_t(size_t, level, npt_level);
>>>>>>>> +
>>>>>>>> +               trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op,
>>>>>>>> level);
>>>>>>>> +
>>>>>>>> +               switch (op) {
>>>>>>>> +               case SNP_PAGE_STATE_SHARED:
>>>>>>>> +                       rc = snp_make_page_shared(kvm, gpa, pfn,
>>>>>>>> level);
>>>>>>>> +                       break;
>>>>>>>> +               case SNP_PAGE_STATE_PRIVATE:
>>>>>>>> +                       rc = rmp_make_private(pfn, gpa, level,
>>>>>>>> sev->asid, false);
>>>>>>>> +                       break;
>>>>>>>> +               default:
>>>>>>>> +                       rc = -EINVAL;
>>>>>>>> +                       break;
>>>>>>>> +               }
>>>>>>>> +
>>>>>>>> +               write_unlock(&kvm->mmu_lock);
>>>>>>>> +
>>>>>>>> +               if (rc) {
>>>>>>>> +                       pr_err_ratelimited("Error op %d gpa %llx
>>>>>>>> pfn %llx level %d rc %d\n",
>>>>>>>> +                                          op, gpa, pfn, level, rc);
>>>>>>>> +                       return rc;
>>>>>>>> +               }
>>>>>>>> +
>>>>>>>> +               gpa = gpa + page_level_size(level);
>>>>>>>> +       }
>>>>>>>> +
>>>>>>>> +       return 0;
>>>>>>>> +}
>>>>>>>> +
Michael Roth Oct. 12, 2022, 10:57 p.m. UTC | #9
On Wed, Oct 12, 2022 at 03:15:15PM -0500, Kalra, Ashish wrote:
> On 9/26/2022 10:19 AM, Peter Gonda wrote:
> > On Mon, Sep 19, 2022 at 5:47 PM Ashish Kalra <ashkalra@amd.com> wrote:
> > > 
> > > 
> > > On 9/19/22 22:18, Tom Lendacky wrote:
> > > > On 9/19/22 17:02, Alper Gun wrote:
> > > > > On Mon, Sep 19, 2022 at 2:38 PM Tom Lendacky
> > > > > <thomas.lendacky@amd.com> wrote:
> > > > > > 
> > > > > > On 9/19/22 12:53, Alper Gun wrote:
> > > > > > > On Fri, Aug 19, 2022 at 9:54 AM Peter Gonda <pgonda@google.com> wrote:
> > > > > > > > 
> > > > > > > > > +
> > > > > > > > > +static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu,
> > > > > > > > > enum psc_op op, gpa_t gpa,
> > > > > > > > > +                                         int level)
> > > > > > > > > +{
> > > > > > > > > +       struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
> > > > > > > > > +       struct kvm *kvm = vcpu->kvm;
> > > > > > > > > +       int rc, npt_level;
> > > > > > > > > +       kvm_pfn_t pfn;
> > > > > > > > > +       gpa_t gpa_end;
> > > > > > > > > +
> > > > > > > > > +       gpa_end = gpa + page_level_size(level);
> > > > > > > > > +
> > > > > > > > > +       while (gpa < gpa_end) {
> > > > > > > > > +               /*
> > > > > > > > > +                * If the gpa is not present in the NPT then
> > > > > > > > > build the NPT.
> > > > > > > > > +                */
> > > > > > > > > +               rc = snp_check_and_build_npt(vcpu, gpa, level);
> > > > > > > > > +               if (rc)
> > > > > > > > > +                       return -EINVAL;
> > > > > > > > > +
> > > > > > > > > +               if (op == SNP_PAGE_STATE_PRIVATE) {
> > > > > > > > > +                       hva_t hva;
> > > > > > > > > +
> > > > > > > > > +                       if (snp_gpa_to_hva(kvm, gpa, &hva))
> > > > > > > > > +                               return -EINVAL;
> > > > > > > > > +
> > > > > > > > > +                       /*
> > > > > > > > > +                        * Verify that the hva range is
> > > > > > > > > registered. This enforcement is
> > > > > > > > > +                        * required to avoid the cases where a
> > > > > > > > > page is marked private
> > > > > > > > > +                        * in the RMP table but never gets
> > > > > > > > > cleanup during the VM
> > > > > > > > > +                        * termination path.
> > > > > > > > > +                        */
> > > > > > > > > +                       mutex_lock(&kvm->lock);
> > > > > > > > > +                       rc = is_hva_registered(kvm, hva,
> > > > > > > > > page_level_size(level));
> > > > > > > > > +                       mutex_unlock(&kvm->lock);
> > > > > > > > > +                       if (!rc)
> > > > > > > > > +                               return -EINVAL;
> > > > > > > > > +
> > > > > > > > > +                       /*
> > > > > > > > > +                        * Mark the userspace range unmerable
> > > > > > > > > before adding the pages
> > > > > > > > > +                        * in the RMP table.
> > > > > > > > > +                        */
> > > > > > > > > +                       mmap_write_lock(kvm->mm);
> > > > > > > > > +                       rc = snp_mark_unmergable(kvm, hva,
> > > > > > > > > page_level_size(level));
> > > > > > > > > +                       mmap_write_unlock(kvm->mm);
> > > > > > > > > +                       if (rc)
> > > > > > > > > +                               return -EINVAL;
> > > > > > > > > +               }
> > > > > > > > > +
> > > > > > > > > +               write_lock(&kvm->mmu_lock);
> > > > > > > > > +
> > > > > > > > > +               rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn,
> > > > > > > > > &npt_level);
> > > > > > > > > +               if (!rc) {
> > > > > > > > > +                       /*
> > > > > > > > > +                        * This may happen if another vCPU
> > > > > > > > > unmapped the page
> > > > > > > > > +                        * before we acquire the lock. Retry the
> > > > > > > > > PSC.
> > > > > > > > > +                        */
> > > > > > > > > + write_unlock(&kvm->mmu_lock);
> > > > > > > > > +                       return 0;
> > > > > > > > > +               }
> > > > > > > > 
> > > > > > > > I think we want to return -EAGAIN or similar if we want the caller to
> > > > > > > > retry, right? I think returning 0 here hides the error.
> > > > > > > > 
> > > > > > > 
> > > > > > > The problem here is that the caller(linux guest kernel) doesn't retry
> > > > > > > if PSC fails. The current implementation in the guest kernel is that
> > > > > > > if a page state change request fails, it terminates the VM with
> > > > > > > GHCB_TERM_PSC reason.
> > > > > > > Returning 0 here is not a good option because it will fail the PSC
> > > > > > > silently and will probably cause a nested RMP fault later. Returning
> > > > > > 
> > > > > > Returning 0 here is ok because the PSC current index into the PSC
> > > > > > structure will not be updated and the guest will then retry (see the
> > > > > > loop
> > > > > > in vmgexit_psc() in arch/x86/kernel/sev.c).
> > > > > > 
> > > > > > Thanks,
> > > > > > Tom
> > > > > 
> > > > > But the host code updates the index. It doesn't leave the loop because
> > > > > rc is 0. The guest will think that it is successful.
> > > > > rc = __snp_handle_page_state_change(vcpu, op, gpa, level);
> > > > > if (rc)
> > > > > goto out;
> > > > > 
> > > > > Also the page state change request with MSR is not retried. It
> > > > > terminates the VM if the MSR request fails.
> > > > 
> > > > Ah, right. I see what you mean. It should probably return a -EAGAIN
> > > > instead of 0 and then the if (rc) check should be modified to
> > > > specifically look for -EAGAIN and goto out after setting rc to 0.
> > > > 
> > > > But that does leave the MSR protocol open to the problem that you
> > > > mention, so, yes, retry logic in snp_handle_page_state_change() for a
> > > > -EAGAIN seems reasonable.
> > > > 
> > > > Thanks,
> > > > Tom
> > > 
> > > I believe it makes more sense to add the retry logic within
> > > __snp_handle_page_state_change() itself, as that will make it work for
> > > both the GHCB MSR protocol and the GHCB VMGEXIT requests.
> > 
> > You are suggesting we just retry 'kvm_mmu_get_tdp_walk' inside of
> > __snp_handle_page_state_change()? That should work but how many times
> > do we retry? If we return EAGAIN or error we can leave it up to the
> > caller
> > 
> 
> Ok, we return -EAGAIN here and then let the caller in
> snp_handle_page_state_change() or sev_handle_vmgexit_msr_protocol()
> (in case of GHCB MSR protocol) do the retries.
> 
> But, the question still remains, how may retry attempts should the caller
> attempt ?

With UPM I don't think we need to deal with this particular case, since we
don't need to walk the NPT to determine the PFN. The PSC will simply get
forward to userspace, and userspace will (generally):

 for shared->private:
   - deallocate page in shared pool
   - allocate page in private pool
   - issue KVM_MEM_ENCRYPT_REG_REGION on the GFN to switch it to private
     in the KVM xarray and RMP table (and zap current NPT mapping)
   - resume guest
   - guest faults on GFN and KVM MMU sees that it is private and maps the GFN
     to the corresponding PFN in the private pool, which should be reliably
     obtainable since it is pinned

 for private->shared:
   - deallocate page in private pool (which will switch it to shared in
     RMP table so it can be safely released back to host)
   - allocate page in shared pool
   - issue KVM_MEM_ENCRYPT_UNREG_REGION on the GFN to switch it to
     shared in the KVM xarray (and zap current NPT mapping)
   - resume guest
   - guest faults on GFN and KVM MMU sees that it is shared and handles it
     just like it would a normal non-SEV guest, so we don't ever need to
     acquire the specific PFN backing the HVA since they are implicitly
     shared, so no need anymore for kvm_mmu_get_tdp_walk() helper

     (also no need for pre-mapping into TDP via kvm_mmu_map_tdp_page() in
     this case, but not sure that was needed even without UPM, seems more like
     an optimization to avoid a 2nd #NPF. I guess we still have the option with
     UPM though if it seems justified, but it would likely happen in
     {REG,UNREG}_REGION in that case rather than SNP-specific hooks)

There may be some other edge cases to consider, but I'm not aware of any
sequences that aren't clearly misbehavior on the part of userspace/guest,
in which case terminating at the host/guest level seems reasonable.

-Mike

> 
> Thanks,
> Ashish
> 
> > > > 
> > > > > 
> > > > > > 
> > > > > > > an error also terminates the guest immediately with current guest
> > > > > > > implementation. I think the best approach here is adding a retry logic
> > > > > > > to this function. Retrying without returning an error should help it
> > > > > > > work because snp_check_and_build_npt will be called again and in the
> > > > > > > second attempt this should work.
> > > > > > > 
> > > > > > > > > +
> > > > > > > > > +               /*
> > > > > > > > > +                * Adjust the level so that we don't go higher
> > > > > > > > > than the backing
> > > > > > > > > +                * page level.
> > > > > > > > > +                */
> > > > > > > > > +               level = min_t(size_t, level, npt_level);
> > > > > > > > > +
> > > > > > > > > +               trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op,
> > > > > > > > > level);
> > > > > > > > > +
> > > > > > > > > +               switch (op) {
> > > > > > > > > +               case SNP_PAGE_STATE_SHARED:
> > > > > > > > > +                       rc = snp_make_page_shared(kvm, gpa, pfn,
> > > > > > > > > level);
> > > > > > > > > +                       break;
> > > > > > > > > +               case SNP_PAGE_STATE_PRIVATE:
> > > > > > > > > +                       rc = rmp_make_private(pfn, gpa, level,
> > > > > > > > > sev->asid, false);
> > > > > > > > > +                       break;
> > > > > > > > > +               default:
> > > > > > > > > +                       rc = -EINVAL;
> > > > > > > > > +                       break;
> > > > > > > > > +               }
> > > > > > > > > +
> > > > > > > > > +               write_unlock(&kvm->mmu_lock);
> > > > > > > > > +
> > > > > > > > > +               if (rc) {
> > > > > > > > > +                       pr_err_ratelimited("Error op %d gpa %llx
> > > > > > > > > pfn %llx level %d rc %d\n",
> > > > > > > > > +                                          op, gpa, pfn, level, rc);
> > > > > > > > > +                       return rc;
> > > > > > > > > +               }
> > > > > > > > > +
> > > > > > > > > +               gpa = gpa + page_level_size(level);
> > > > > > > > > +       }
> > > > > > > > > +
> > > > > > > > > +       return 0;
> > > > > > > > > +}
> > > > > > > > > +
diff mbox series

Patch

diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 0a9055cdfae2..ee38f7408470 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -93,6 +93,10 @@  enum psc_op {
 };
 
 #define GHCB_MSR_PSC_REQ		0x014
+#define GHCB_MSR_PSC_GFN_POS		12
+#define GHCB_MSR_PSC_GFN_MASK		GENMASK_ULL(39, 0)
+#define GHCB_MSR_PSC_OP_POS		52
+#define GHCB_MSR_PSC_OP_MASK		0xf
 #define GHCB_MSR_PSC_REQ_GFN(gfn, op)			\
 	/* GHCBData[55:52] */				\
 	(((u64)((op) & 0xf) << 52) |			\
@@ -102,6 +106,11 @@  enum psc_op {
 	GHCB_MSR_PSC_REQ)
 
 #define GHCB_MSR_PSC_RESP		0x015
+#define GHCB_MSR_PSC_ERROR_POS		32
+#define GHCB_MSR_PSC_ERROR_MASK		GENMASK_ULL(31, 0)
+#define GHCB_MSR_PSC_ERROR		GENMASK_ULL(31, 0)
+#define GHCB_MSR_PSC_RSVD_POS		12
+#define GHCB_MSR_PSC_RSVD_MASK		GENMASK_ULL(19, 0)
 #define GHCB_MSR_PSC_RESP_VAL(val)			\
 	/* GHCBData[63:32] */				\
 	(((u64)(val) & GENMASK_ULL(63, 32)) >> 32)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 6de48130e414..15900c2f30fc 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -32,6 +32,7 @@ 
 #include "svm_ops.h"
 #include "cpuid.h"
 #include "trace.h"
+#include "mmu.h"
 
 #ifndef CONFIG_KVM_AMD_SEV
 /*
@@ -3252,6 +3253,181 @@  static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
 	svm->vmcb->control.ghcb_gpa = value;
 }
 
+static int snp_rmptable_psmash(struct kvm *kvm, kvm_pfn_t pfn)
+{
+	pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
+
+	return psmash(pfn);
+}
+
+static int snp_make_page_shared(struct kvm *kvm, gpa_t gpa, kvm_pfn_t pfn, int level)
+{
+	int rc, rmp_level;
+
+	rc = snp_lookup_rmpentry(pfn, &rmp_level);
+	if (rc < 0)
+		return -EINVAL;
+
+	/* If page is not assigned then do nothing */
+	if (!rc)
+		return 0;
+
+	/*
+	 * Is the page part of an existing 2MB RMP entry ? Split the 2MB into
+	 * multiple of 4K-page before making the memory shared.
+	 */
+	if (level == PG_LEVEL_4K && rmp_level == PG_LEVEL_2M) {
+		rc = snp_rmptable_psmash(kvm, pfn);
+		if (rc)
+			return rc;
+	}
+
+	return rmp_make_shared(pfn, level);
+}
+
+static int snp_check_and_build_npt(struct kvm_vcpu *vcpu, gpa_t gpa, int level)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int rc, npt_level;
+	kvm_pfn_t pfn;
+
+	/*
+	 * Get the pfn and level for the gpa from the nested page table.
+	 *
+	 * If the tdp walk fails, then its safe to say that there is no
+	 * valid mapping for this gpa. Create a fault to build the map.
+	 */
+	write_lock(&kvm->mmu_lock);
+	rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level);
+	write_unlock(&kvm->mmu_lock);
+	if (!rc) {
+		pfn = kvm_mmu_map_tdp_page(vcpu, gpa, PFERR_USER_MASK, level);
+		if (is_error_noslot_pfn(pfn))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int snp_gpa_to_hva(struct kvm *kvm, gpa_t gpa, hva_t *hva)
+{
+	struct kvm_memory_slot *slot;
+	gfn_t gfn = gpa_to_gfn(gpa);
+	int idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	slot = gfn_to_memslot(kvm, gfn);
+	if (!slot) {
+		srcu_read_unlock(&kvm->srcu, idx);
+		return -EINVAL;
+	}
+
+	/*
+	 * Note, using the __gfn_to_hva_memslot() is not solely for performance,
+	 * it's also necessary to avoid the "writable" check in __gfn_to_hva_many(),
+	 * which will always fail on read-only memslots due to gfn_to_hva() assuming
+	 * writes.
+	 */
+	*hva = __gfn_to_hva_memslot(slot, gfn);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return 0;
+}
+
+static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op, gpa_t gpa,
+					  int level)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+	struct kvm *kvm = vcpu->kvm;
+	int rc, npt_level;
+	kvm_pfn_t pfn;
+	gpa_t gpa_end;
+
+	gpa_end = gpa + page_level_size(level);
+
+	while (gpa < gpa_end) {
+		/*
+		 * If the gpa is not present in the NPT then build the NPT.
+		 */
+		rc = snp_check_and_build_npt(vcpu, gpa, level);
+		if (rc)
+			return -EINVAL;
+
+		if (op == SNP_PAGE_STATE_PRIVATE) {
+			hva_t hva;
+
+			if (snp_gpa_to_hva(kvm, gpa, &hva))
+				return -EINVAL;
+
+			/*
+			 * Verify that the hva range is registered. This enforcement is
+			 * required to avoid the cases where a page is marked private
+			 * in the RMP table but never gets cleanup during the VM
+			 * termination path.
+			 */
+			mutex_lock(&kvm->lock);
+			rc = is_hva_registered(kvm, hva, page_level_size(level));
+			mutex_unlock(&kvm->lock);
+			if (!rc)
+				return -EINVAL;
+
+			/*
+			 * Mark the userspace range unmerable before adding the pages
+			 * in the RMP table.
+			 */
+			mmap_write_lock(kvm->mm);
+			rc = snp_mark_unmergable(kvm, hva, page_level_size(level));
+			mmap_write_unlock(kvm->mm);
+			if (rc)
+				return -EINVAL;
+		}
+
+		write_lock(&kvm->mmu_lock);
+
+		rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level);
+		if (!rc) {
+			/*
+			 * This may happen if another vCPU unmapped the page
+			 * before we acquire the lock. Retry the PSC.
+			 */
+			write_unlock(&kvm->mmu_lock);
+			return 0;
+		}
+
+		/*
+		 * Adjust the level so that we don't go higher than the backing
+		 * page level.
+		 */
+		level = min_t(size_t, level, npt_level);
+
+		trace_kvm_snp_psc(vcpu->vcpu_id, pfn, gpa, op, level);
+
+		switch (op) {
+		case SNP_PAGE_STATE_SHARED:
+			rc = snp_make_page_shared(kvm, gpa, pfn, level);
+			break;
+		case SNP_PAGE_STATE_PRIVATE:
+			rc = rmp_make_private(pfn, gpa, level, sev->asid, false);
+			break;
+		default:
+			rc = -EINVAL;
+			break;
+		}
+
+		write_unlock(&kvm->mmu_lock);
+
+		if (rc) {
+			pr_err_ratelimited("Error op %d gpa %llx pfn %llx level %d rc %d\n",
+					   op, gpa, pfn, level, rc);
+			return rc;
+		}
+
+		gpa = gpa + page_level_size(level);
+	}
+
+	return 0;
+}
+
 static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -3352,6 +3528,27 @@  static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
 				  GHCB_MSR_INFO_POS);
 		break;
 	}
+	case GHCB_MSR_PSC_REQ: {
+		gfn_t gfn;
+		int ret;
+		enum psc_op op;
+
+		gfn = get_ghcb_msr_bits(svm, GHCB_MSR_PSC_GFN_MASK, GHCB_MSR_PSC_GFN_POS);
+		op = get_ghcb_msr_bits(svm, GHCB_MSR_PSC_OP_MASK, GHCB_MSR_PSC_OP_POS);
+
+		ret = __snp_handle_page_state_change(vcpu, op, gfn_to_gpa(gfn), PG_LEVEL_4K);
+
+		if (ret)
+			set_ghcb_msr_bits(svm, GHCB_MSR_PSC_ERROR,
+					  GHCB_MSR_PSC_ERROR_MASK, GHCB_MSR_PSC_ERROR_POS);
+		else
+			set_ghcb_msr_bits(svm, 0,
+					  GHCB_MSR_PSC_ERROR_MASK, GHCB_MSR_PSC_ERROR_POS);
+
+		set_ghcb_msr_bits(svm, 0, GHCB_MSR_PSC_RSVD_MASK, GHCB_MSR_PSC_RSVD_POS);
+		set_ghcb_msr_bits(svm, GHCB_MSR_PSC_RESP, GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
+		break;
+	}
 	case GHCB_MSR_TERM_REQ: {
 		u64 reason_set, reason_code;
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 9b9bc5468103..79801e50344a 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -7,6 +7,7 @@ 
 #include <asm/svm.h>
 #include <asm/clocksource.h>
 #include <asm/pvclock-abi.h>
+#include <asm/sev-common.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -1755,6 +1756,39 @@  TRACE_EVENT(kvm_vmgexit_msr_protocol_exit,
 		  __entry->vcpu_id, __entry->ghcb_gpa, __entry->result)
 );
 
+/*
+ * Tracepoint for the SEV-SNP page state change processing
+ */
+#define psc_operation					\
+	{SNP_PAGE_STATE_PRIVATE, "private"},		\
+	{SNP_PAGE_STATE_SHARED,  "shared"}		\
+
+TRACE_EVENT(kvm_snp_psc,
+	TP_PROTO(unsigned int vcpu_id, u64 pfn, u64 gpa, u8 op, int level),
+	TP_ARGS(vcpu_id, pfn, gpa, op, level),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(u64, pfn)
+		__field(u64, gpa)
+		__field(u8, op)
+		__field(int, level)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->pfn = pfn;
+		__entry->gpa = gpa;
+		__entry->op = op;
+		__entry->level = level;
+	),
+
+	TP_printk("vcpu %u, pfn %llx, gpa %llx, op %s, level %d",
+		  __entry->vcpu_id, __entry->pfn, __entry->gpa,
+		  __print_symbolic(__entry->op, psc_operation),
+		  __entry->level)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 50fff5202e7e..4a1d16231e30 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13066,6 +13066,7 @@  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_snp_psc);
 
 static int __init kvm_x86_init(void)
 {