[v6,6/8] KVM: Handle page fault for private memory

Message ID 20220519153713.819591-7-chao.p.peng@linux.intel.com (mailing list archive)
State New
Series KVM: mm: fd-based approach for supporting KVM guest private memory

Commit Message

Chao Peng May 19, 2022, 3:37 p.m. UTC
A page fault can carry the information of whether the access is private
or not for a KVM_MEM_PRIVATE memslot; this can be filled in by architecture
code (like TDX code). To handle a page fault for such an access, KVM maps
the page only when this private property matches the host's view of the
page, which is decided by checking whether the corresponding page is
populated in the private fd. A page is considered private when it is
populated in the private fd; otherwise it is shared.

For a successful match, the private pfn is obtained via the memfile_notifier
callbacks from the private fd, and the shared pfn is obtained with the
existing get_user_pages().

For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to
userspace. Userspace can then convert the memory between private and
shared from the host's view and retry the access.

Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 arch/x86/kvm/mmu.h              |  1 +
 arch/x86/kvm/mmu/mmu.c          | 70 +++++++++++++++++++++++++++++++--
 arch/x86/kvm/mmu/mmu_internal.h | 17 ++++++++
 arch/x86/kvm/mmu/mmutrace.h     |  1 +
 arch/x86/kvm/mmu/paging_tmpl.h  |  5 ++-
 include/linux/kvm_host.h        | 22 +++++++++++
 6 files changed, 112 insertions(+), 4 deletions(-)
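
For reference, a minimal sketch of how a userspace VMM might react to the
KVM_EXIT_MEMORY_FAULT exit described above, assuming conversion is driven by
populating or hole-punching the private fd as elsewhere in this series;
private_fd and gpa_to_private_offset() are hypothetical VMM-side helpers, not
part of this patch:

/* Illustrative userspace sketch only, not part of the kernel patch. */
#define _GNU_SOURCE
#include <fcntl.h>          /* fallocate(), FALLOC_FL_PUNCH_HOLE */
#include <linux/kvm.h>      /* struct kvm_run, KVM_EXIT_MEMORY_FAULT */

/* Hypothetical VMM bookkeeping: maps a gpa to its offset in the private fd. */
off_t gpa_to_private_offset(__u64 gpa);

static void handle_memory_fault_exit(struct kvm_run *run, int private_fd)
{
	off_t offset = gpa_to_private_offset(run->memory.gpa);

	if (run->memory.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE)
		/* Guest accessed the gpa as private: populate the private fd. */
		fallocate(private_fd, 0, offset, run->memory.size);
	else
		/* Guest accessed the gpa as shared: punch out the private backing. */
		fallocate(private_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			  offset, run->memory.size);

	/* Re-enter the guest; KVM will retry the faulting access. */
}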

Comments

Sean Christopherson June 17, 2022, 9:30 p.m. UTC | #1
On Thu, May 19, 2022, Chao Peng wrote:
> @@ -4028,8 +4081,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
>  	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
>  		return true;
>  
> -	return fault->slot &&
> -	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
> +	if (fault->is_private)
> +		return mmu_notifier_retry(vcpu->kvm, mmu_seq);

Hmm, this is somewhat undesirable, because faulting in private pfns will be blocked
by unrelated mmu_notifier updates.  The issue is mitigated to some degree by bumping
the sequence count if and only if overlap with a memslot is detected, e.g. mapping
changes that affect only userspace won't block the guest.

It probably won't be an issue, but at the same time it's easy to solve, and I don't
like piggybacking on mmu_notifier_seq, as private mappings shouldn't be subject to the
mmu_notifier.

That would also fix a theoretical bug in this patch where mmu_notifier_retry()
wouldn't be defined if CONFIG_MEMFILE_NOTIFIER=y && CONFIG_MMU_NOTIFIER=n.

---
 arch/x86/kvm/mmu/mmu.c   | 11 ++++++-----
 include/linux/kvm_host.h | 16 +++++++++++-----
 virt/kvm/kvm_main.c      |  2 +-
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0b455c16ec64..a4cbd29433e7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4100,10 +4100,10 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
 		return true;

 	if (fault->is_private)
-		return mmu_notifier_retry(vcpu->kvm, mmu_seq);
-	else
-		return fault->slot &&
-			mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+		return memfile_notifier_retry(vcpu->kvm, mmu_seq);
+
+	return fault->slot &&
+	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
 }

 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -4127,7 +4127,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	if (r)
 		return r;

-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	mmu_seq = fault->is_private ? vcpu->kvm->memfile_notifier_seq :
+				      vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

 	r = kvm_faultin_pfn(vcpu, fault);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 92afa5bddbc5..31f704c83099 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -773,16 +773,15 @@ struct kvm {
 	struct hlist_head irq_ack_notifier_list;
 #endif

-#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) ||\
-	defined(CONFIG_MEMFILE_NOTIFIER)
+#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER))
 	unsigned long mmu_notifier_seq;
-#endif
-
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	struct mmu_notifier mmu_notifier;
 	long mmu_notifier_count;
 	unsigned long mmu_notifier_range_start;
 	unsigned long mmu_notifier_range_end;
+#endif
+#ifdef CONFIG_MEMFILE_NOTIFIER
+	unsigned long memfile_notifier_seq;
 #endif
 	struct list_head devices;
 	u64 manual_dirty_log_protect;
@@ -1964,6 +1963,13 @@ static inline int mmu_notifier_retry_hva(struct kvm *kvm,
 }
 #endif

+#ifdef CONFIG_MEMFILE_NOTIFIER
+static inline bool memfile_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
+{
+	return kvm->memfile_notifier_seq != mmu_seq;
+}
+#endif
+
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING

 #define KVM_MAX_IRQ_ROUTES 4096 /* might need extension/rework in the future */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2b416d3bd60e..e6d34c964d51 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -898,7 +898,7 @@ static void kvm_private_mem_notifier_handler(struct memfile_notifier *notifier,
 	KVM_MMU_LOCK(kvm);
 	if (kvm_unmap_gfn_range(kvm, &gfn_range))
 		kvm_flush_remote_tlbs(kvm);
-	kvm->mmu_notifier_seq++;
+	kvm->memfile_notifier_seq++;
 	KVM_MMU_UNLOCK(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 }

base-commit: 333ef501c7f6c6d4ef2b7678905cad0f8ef3e271
--

> +	else
> +		return fault->slot &&
> +			mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
>  }
>  
>  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  		read_unlock(&vcpu->kvm->mmu_lock);
>  	else
>  		write_unlock(&vcpu->kvm->mmu_lock);
> -	kvm_release_pfn_clean(fault->pfn);
> +
> +	if (fault->is_private)
> +		kvm_private_mem_put_pfn(fault->slot, fault->pfn);

Why does the shmem path lock the page, and then unlock it here?

Same question for why this path marks it dirty?  The guest has the page mapped
so the dirty flag is immediately stale.

In other words, why does KVM need to do something different for private pfns?

> +	else
> +		kvm_release_pfn_clean(fault->pfn);
> +
>  	return r;
>  }
>  

...

> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 7f8f1c8dbed2..1d857919a947 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -878,7 +878,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  
>  out_unlock:
>  	write_unlock(&vcpu->kvm->mmu_lock);
> -	kvm_release_pfn_clean(fault->pfn);
> +	if (fault->is_private)

Indirect MMUs can't support private faults, i.e. this is unnecessary.

> +		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
> +	else
> +		kvm_release_pfn_clean(fault->pfn);
>  	return r;
>  }
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3fd168972ecd..b0a7910505ed 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2241,4 +2241,26 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
>  /* Max number of entries allowed for each kvm dirty ring */
>  #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
>  
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
> +					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
> +{
> +	int ret;
> +	pfn_t pfnt;
> +	pgoff_t index = gfn - slot->base_gfn +
> +			(slot->private_offset >> PAGE_SHIFT);
> +
> +	ret = slot->notifier.bs->get_lock_pfn(slot->private_file, index, &pfnt,
> +						order);
> +	*pfn = pfn_t_to_pfn(pfnt);
> +	return ret;
> +}
> +
> +static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
> +					   kvm_pfn_t pfn)
> +{
> +	slot->notifier.bs->put_unlock_pfn(pfn_to_pfn_t(pfn));
> +}
> +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
> +
>  #endif
> -- 
> 2.25.1
>
Chao Peng June 20, 2022, 2:16 p.m. UTC | #2
On Fri, Jun 17, 2022 at 09:30:53PM +0000, Sean Christopherson wrote:
> On Thu, May 19, 2022, Chao Peng wrote:
> > @@ -4028,8 +4081,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
> >  	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
> >  		return true;
> >  
> > -	return fault->slot &&
> > -	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
> > +	if (fault->is_private)
> > +		return mmu_notifier_retry(vcpu->kvm, mmu_seq);
> 
> Hmm, this is somewhat undesirable, because faulting in private pfns will be blocked
> by unrelated mmu_notifier updates.  The issue is mitigated to some degree by bumping
> the sequence count if and only if overlap with a memslot is detected, e.g. mapping
> changes that affect only userspace won't block the guest.
> 
> It probably won't be an issue, but at the same time it's easy to solve, and I don't
> like piggybacking mmu_notifier_seq as private mappings shouldn't be subject to the
> mmu_notifier.
> 
> That would also fix a theoretical bug in this patch where mmu_notifier_retry()
> wouldn't be defined if CONFIG_MEMFILE_NOTIFIER=y && CONFIG_MMU_NOTIFIER=n.

Agreed, Thanks.

> 
> ---
>  arch/x86/kvm/mmu/mmu.c   | 11 ++++++-----
>  include/linux/kvm_host.h | 16 +++++++++++-----
>  virt/kvm/kvm_main.c      |  2 +-
>  3 files changed, 18 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0b455c16ec64..a4cbd29433e7 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4100,10 +4100,10 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
>  		return true;
> 
>  	if (fault->is_private)
> -		return mmu_notifier_retry(vcpu->kvm, mmu_seq);
> -	else
> -		return fault->slot &&
> -			mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
> +		return memfile_notifier_retry(vcpu->kvm, mmu_seq);
> +
> +	return fault->slot &&
> +	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
>  }
> 
>  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> @@ -4127,7 +4127,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	if (r)
>  		return r;
> 
> -	mmu_seq = vcpu->kvm->mmu_notifier_seq;
> +	mmu_seq = fault->is_private ? vcpu->kvm->memfile_notifier_seq :
> +				      vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
> 
>  	r = kvm_faultin_pfn(vcpu, fault);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 92afa5bddbc5..31f704c83099 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -773,16 +773,15 @@ struct kvm {
>  	struct hlist_head irq_ack_notifier_list;
>  #endif
> 
> -#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) ||\
> -	defined(CONFIG_MEMFILE_NOTIFIER)
> +#if (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER))
>  	unsigned long mmu_notifier_seq;
> -#endif
> -
> -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>  	struct mmu_notifier mmu_notifier;
>  	long mmu_notifier_count;
>  	unsigned long mmu_notifier_range_start;
>  	unsigned long mmu_notifier_range_end;
> +#endif
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +	unsigned long memfile_notifier_seq;
>  #endif
>  	struct list_head devices;
>  	u64 manual_dirty_log_protect;
> @@ -1964,6 +1963,13 @@ static inline int mmu_notifier_retry_hva(struct kvm *kvm,
>  }
>  #endif
> 
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +static inline bool memfile_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
> +{
> +	return kvm->memfile_notifier_seq != mmu_seq;
> +}
> +#endif
> +
>  #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> 
>  #define KVM_MAX_IRQ_ROUTES 4096 /* might need extension/rework in the future */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2b416d3bd60e..e6d34c964d51 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -898,7 +898,7 @@ static void kvm_private_mem_notifier_handler(struct memfile_notifier *notifier,
>  	KVM_MMU_LOCK(kvm);
>  	if (kvm_unmap_gfn_range(kvm, &gfn_range))
>  		kvm_flush_remote_tlbs(kvm);
> -	kvm->mmu_notifier_seq++;
> +	kvm->memfile_notifier_seq++;
>  	KVM_MMU_UNLOCK(kvm);
>  	srcu_read_unlock(&kvm->srcu, idx);
>  }
> 
> base-commit: 333ef501c7f6c6d4ef2b7678905cad0f8ef3e271
> --
> 
> > +	else
> > +		return fault->slot &&
> > +			mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
> >  }
> >  
> >  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> >  		read_unlock(&vcpu->kvm->mmu_lock);
> >  	else
> >  		write_unlock(&vcpu->kvm->mmu_lock);
> > -	kvm_release_pfn_clean(fault->pfn);
> > +
> > +	if (fault->is_private)
> > +		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
> 
> Why does the shmem path lock the page, and then unlock it here?

Initially this was to prevent a race between SLPT population and
truncate/punch on the fd. Without it, a gfn may become stale before
the page is populated in the SLPT. However, with the memfile_notifier_retry
mechanism, this no longer seems needed.

> 
> Same question for why this path marks it dirty?  The guest has the page mapped
> so the dirty flag is immediately stale.

I believe so.

> 
> In other words, why does KVM need to do something different for private pfns?

These two are inherited from Kirill's previous code. Let's see if he has
any comment.

> 
> > +	else
> > +		kvm_release_pfn_clean(fault->pfn);
> > +
> >  	return r;
> >  }
> >  
> 
> ...
> 
> > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> > index 7f8f1c8dbed2..1d857919a947 100644
> > --- a/arch/x86/kvm/mmu/paging_tmpl.h
> > +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> > @@ -878,7 +878,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> >  
> >  out_unlock:
> >  	write_unlock(&vcpu->kvm->mmu_lock);
> > -	kvm_release_pfn_clean(fault->pfn);
> > +	if (fault->is_private)
> 
> Indirect MMUs can't support private faults, i.e. this is unnecessary.

Okay.

> 
> > +		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
> > +	else
> > +		kvm_release_pfn_clean(fault->pfn);
> >  	return r;
> >  }
> >  
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 3fd168972ecd..b0a7910505ed 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -2241,4 +2241,26 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
> >  /* Max number of entries allowed for each kvm dirty ring */
> >  #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
> >  
> > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > +static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
> > +					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
> > +{
> > +	int ret;
> > +	pfn_t pfnt;
> > +	pgoff_t index = gfn - slot->base_gfn +
> > +			(slot->private_offset >> PAGE_SHIFT);
> > +
> > +	ret = slot->notifier.bs->get_lock_pfn(slot->private_file, index, &pfnt,
> > +						order);
> > +	*pfn = pfn_t_to_pfn(pfnt);
> > +	return ret;
> > +}
> > +
> > +static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
> > +					   kvm_pfn_t pfn)
> > +{
> > +	slot->notifier.bs->put_unlock_pfn(pfn_to_pfn_t(pfn));
> > +}
> > +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
> > +
> >  #endif
> > -- 
> > 2.25.1
> >
Nikunj A. Dadhania June 24, 2022, 3:58 a.m. UTC | #3
On 5/19/2022 9:07 PM, Chao Peng wrote:
> A page fault can carry the information of whether the access is private
> or not for a KVM_MEM_PRIVATE memslot; this can be filled in by architecture
> code (like TDX code). To handle a page fault for such an access, KVM maps
> the page only when this private property matches the host's view of the
> page, which is decided by checking whether the corresponding page is
> populated in the private fd. A page is considered private when it is
> populated in the private fd; otherwise it is shared.
> 
> For a successful match, the private pfn is obtained via the memfile_notifier
> callbacks from the private fd, and the shared pfn is obtained with the
> existing get_user_pages().
> 
> For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to
> userspace. Userspace can then convert the memory between private and
> shared from the host's view and retry the access.
> 
> Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  arch/x86/kvm/mmu.h              |  1 +
>  arch/x86/kvm/mmu/mmu.c          | 70 +++++++++++++++++++++++++++++++--
>  arch/x86/kvm/mmu/mmu_internal.h | 17 ++++++++
>  arch/x86/kvm/mmu/mmutrace.h     |  1 +
>  arch/x86/kvm/mmu/paging_tmpl.h  |  5 ++-
>  include/linux/kvm_host.h        | 22 +++++++++++
>  6 files changed, 112 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 7e258cc94152..c84835762249 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -176,6 +176,7 @@ struct kvm_page_fault {
>  
>  	/* Derived from mmu and global state.  */
>  	const bool is_tdp;
> +	const bool is_private;
>  	const bool nx_huge_page_workaround_enabled;
>  
>  	/*
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index afe18d70ece7..e18460e0d743 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2899,6 +2899,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;
>  
> +	if (kvm_slot_is_private(slot))
> +		return max_level;

Can you explain the rationale behind the above change? 
AFAIU, this overrides the transparent_hugepage=never setting for both 
shared and private mappings.

>  	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
>  	return min(host_level, max_level);
>  }

Regards
Nikunj
Chao Peng June 24, 2022, 9:02 a.m. UTC | #4
On Fri, Jun 24, 2022 at 09:28:23AM +0530, Nikunj A. Dadhania wrote:
> On 5/19/2022 9:07 PM, Chao Peng wrote:
> > A page fault can carry the information of whether the access is private
> > or not for a KVM_MEM_PRIVATE memslot; this can be filled in by architecture
> > code (like TDX code). To handle a page fault for such an access, KVM maps
> > the page only when this private property matches the host's view of the
> > page, which is decided by checking whether the corresponding page is
> > populated in the private fd. A page is considered private when it is
> > populated in the private fd; otherwise it is shared.
> > 
> > For a successful match, the private pfn is obtained via the memfile_notifier
> > callbacks from the private fd, and the shared pfn is obtained with the
> > existing get_user_pages().
> > 
> > For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to
> > userspace. Userspace can then convert the memory between private and
> > shared from the host's view and retry the access.
> > 
> > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> >  arch/x86/kvm/mmu.h              |  1 +
> >  arch/x86/kvm/mmu/mmu.c          | 70 +++++++++++++++++++++++++++++++--
> >  arch/x86/kvm/mmu/mmu_internal.h | 17 ++++++++
> >  arch/x86/kvm/mmu/mmutrace.h     |  1 +
> >  arch/x86/kvm/mmu/paging_tmpl.h  |  5 ++-
> >  include/linux/kvm_host.h        | 22 +++++++++++
> >  6 files changed, 112 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> > index 7e258cc94152..c84835762249 100644
> > --- a/arch/x86/kvm/mmu.h
> > +++ b/arch/x86/kvm/mmu.h
> > @@ -176,6 +176,7 @@ struct kvm_page_fault {
> >  
> >  	/* Derived from mmu and global state.  */
> >  	const bool is_tdp;
> > +	const bool is_private;
> >  	const bool nx_huge_page_workaround_enabled;
> >  
> >  	/*
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index afe18d70ece7..e18460e0d743 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -2899,6 +2899,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> >  	if (max_level == PG_LEVEL_4K)
> >  		return PG_LEVEL_4K;
> >  
> > +	if (kvm_slot_is_private(slot))
> > +		return max_level;
> 
> Can you explain the rationale behind the above change? 
> AFAIU, this overrides the transparent_hugepage=never setting for both 
> shared and private mappings.

As Sean pointed out, this should check against fault->is_private instead
of the slot. For a private fault, the level is retrieved and stored in
fault->max_level in kvm_faultin_pfn_private() instead of here.

For a shared fault, it will continue to query host_level below. For a
private fault, the host level has already been accounted for in
kvm_faultin_pfn_private().
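
A rough sketch of that direction, for illustration only: the extra
"is_private" parameter is hypothetical (in the series the private level comes
from the backing store's page order via kvm_faultin_pfn_private()), and the
lpage_info checks of the real function are elided.

int kvm_mmu_max_mapping_level(struct kvm *kvm,
			      const struct kvm_memory_slot *slot, gfn_t gfn,
			      kvm_pfn_t pfn, int max_level, bool is_private)
{
	int host_level;

	/* ... existing lpage_info / disallow_lpage checks ... */

	if (max_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;

	/*
	 * A private pfn does not come from the userspace page tables, so
	 * host_pfn_mapping_level() has nothing useful to say about it; the
	 * cap derived from the private fd is already in max_level.
	 */
	if (is_private)
		return max_level;

	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
	return min(host_level, max_level);
}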

Chao
> 
> >  	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
> >  	return min(host_level, max_level);
> >  }
> 
> Regards
> Nikunj
Vishal Annapurve June 30, 2022, 7:14 p.m. UTC | #5
...
> > >     /*
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index afe18d70ece7..e18460e0d743 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -2899,6 +2899,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> > >     if (max_level == PG_LEVEL_4K)
> > >             return PG_LEVEL_4K;
> > >
> > > +   if (kvm_slot_is_private(slot))
> > > +           return max_level;
> >
> > Can you explain the rationale behind the above change?
> > AFAIU, this overrides the transparent_hugepage=never setting for both
> > shared and private mappings.
>
> As Sean pointed out, this should check against fault->is_private instead
> of the slot. For private fault, the level is retrieved and stored to
> fault->max_level in kvm_faultin_pfn_private() instead of here.
>
> For shared fault, it will continue to query host_level below. For
> private fault, the host level has already been accounted in
> kvm_faultin_pfn_private().
>
> Chao
> >

With the transparent_hugepage=always setting I see issues with the
current implementation.

Scenario:
1) Guest accesses a gfn range 0x800-0xa00 as private
2) Guest calls mapgpa to convert the range 0x84d-0x86e as shared
3) Guest tries to access recently converted memory as shared for the first time
Guest VM shutdown is observed after step 3 -> the guest is unable to
proceed further since the code section is somehow not as expected.

Corresponding KVM trace logs after step 3:
VCPU-0-61883   [078] ..... 72276.115679: kvm_page_fault: address
84d000 error_code 4
VCPU-0-61883   [078] ..... 72276.127005: kvm_mmu_spte_requested: gfn
84d pfn 100b4a4d level 2
VCPU-0-61883   [078] ..... 72276.127008: kvm_tdp_mmu_spte_changed: as
id 0 gfn 800 level 2 old_spte 100b1b16827 new_spte 100b4a00ea7
VCPU-0-61883   [078] ..... 72276.127009: kvm_mmu_prepare_zap_page: sp
gen 0 gfn 800 l1 8-byte q0 direct wux nxe ad root 0 sync
VCPU-0-61883   [078] ..... 72276.127009: kvm_tdp_mmu_spte_changed: as
id 0 gfn 800 level 1 old_spte 1003eb27e67 new_spte 5a0
VCPU-0-61883   [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as
id 0 gfn 801 level 1 old_spte 10056cc8e67 new_spte 5a0
VCPU-0-61883   [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as
id 0 gfn 802 level 1 old_spte 10056fa2e67 new_spte 5a0
VCPU-0-61883   [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as
id 0 gfn 803 level 1 old_spte 0 new_spte 5a0
....
 VCPU-0-61883   [078] ..... 72276.127089: kvm_tdp_mmu_spte_changed: as
id 0 gfn 9ff level 1 old_spte 100a43f4e67 new_spte 5a0
 VCPU-0-61883   [078] ..... 72276.127090: kvm_mmu_set_spte: gfn 800
spte 100b4a00ea7 (rwxu) level 2 at 10052fa5020
 VCPU-0-61883   [078] ..... 72276.127091: kvm_fpu: unload

Looks like with transparent huge pages enabled, KVM tried to handle the
shared memory fault on gfn 0x84d by coalescing nearby 4K pages
into a contiguous 2MB page mapping at gfn 0x800, since level 2 was
requested in kvm_mmu_spte_requested.
This caused the private memory contents from the regions 0x800-0x84c and
0x86e-0xa00 to get unmapped from the guest, leading to the guest VM
shutdown.

Does deriving the mapping level from the fault access type help
address the above issue? Any such coalescing should not cross between
private and shared memory regions.

> > >     host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
> > >     return min(host_level, max_level);
> > >  }
> >

Regards,
Vishal
Michael Roth June 30, 2022, 10:21 p.m. UTC | #6
On Thu, Jun 30, 2022 at 12:14:13PM -0700, Vishal Annapurve wrote:
> With transparent_hugepages=always setting I see issues with the
> current implementation.
> 
> Scenario:
> 1) Guest accesses a gfn range 0x800-0xa00 as private
> 2) Guest calls mapgpa to convert the range 0x84d-0x86e as shared
> 3) Guest tries to access recently converted memory as shared for the first time
> Guest VM shutdown is observed after step 3 -> Guest is unable to
> proceed further since somehow code section is not as expected
> 
> Corresponding KVM trace logs after step 3:
> VCPU-0-61883   [078] ..... 72276.115679: kvm_page_fault: address
> 84d000 error_code 4
> VCPU-0-61883   [078] ..... 72276.127005: kvm_mmu_spte_requested: gfn
> 84d pfn 100b4a4d level 2
> VCPU-0-61883   [078] ..... 72276.127008: kvm_tdp_mmu_spte_changed: as
> id 0 gfn 800 level 2 old_spte 100b1b16827 new_spte 100b4a00ea7
> VCPU-0-61883   [078] ..... 72276.127009: kvm_mmu_prepare_zap_page: sp
> gen 0 gfn 800 l1 8-byte q0 direct wux nxe ad root 0 sync
> VCPU-0-61883   [078] ..... 72276.127009: kvm_tdp_mmu_spte_changed: as
> id 0 gfn 800 level 1 old_spte 1003eb27e67 new_spte 5a0
> VCPU-0-61883   [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as
> id 0 gfn 801 level 1 old_spte 10056cc8e67 new_spte 5a0
> VCPU-0-61883   [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as
> id 0 gfn 802 level 1 old_spte 10056fa2e67 new_spte 5a0
> VCPU-0-61883   [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as
> id 0 gfn 803 level 1 old_spte 0 new_spte 5a0
> ....
>  VCPU-0-61883   [078] ..... 72276.127089: kvm_tdp_mmu_spte_changed: as
> id 0 gfn 9ff level 1 old_spte 100a43f4e67 new_spte 5a0
>  VCPU-0-61883   [078] ..... 72276.127090: kvm_mmu_set_spte: gfn 800
> spte 100b4a00ea7 (rwxu) level 2 at 10052fa5020
>  VCPU-0-61883   [078] ..... 72276.127091: kvm_fpu: unload
> 
> Looks like with transparent huge pages enabled kvm tried to handle the
> shared memory fault on 0x84d gfn by coalescing nearby 4K pages
> to form a contiguous 2MB page mapping at gfn 0x800, since level 2 was
> requested in kvm_mmu_spte_requested.
> This caused the private memory contents from regions 0x800-0x84c and
> 0x86e-0xa00 to get unmapped from the guest leading to guest vm
> shutdown.

Interesting... seems like that wouldn't be an issue for non-UPM SEV, since
the private pages would still be mapped as part of that 2M mapping, and
it's completely up to the guest as to whether it wants to access as
private or shared. But for UPM it makes sense this would cause issues.

> 
> Does getting the mapping level as per the fault access type help
> address the above issue? Any such coalescing should not cross between
> private to
> shared or shared to private memory regions.

Doesn't seem like changing the check to fault->is_private would help in
your particular case, since the subsequent host_pfn_mapping_level() call
only seems to limit the mapping level to whatever the mapping level is
for the HVA in the host page table.

Seems like with UPM we need some additional handling here that also
checks that the entire 2M HVA range is backed by non-private memory.

Non-UPM SNP hypervisor patches already have a similar hook added to
host_pfn_mapping_level() which implements such a check via RMP table, so
UPM might need something similar:

  https://github.com/AMDESE/linux/commit/ae4475bc740eb0b9d031a76412b0117339794139

-Mike

> 
> > > >     host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
> > > >     return min(host_level, max_level);
> > > >  }
> > >
> 
> Regards,
> Vishal
Xiaoyao Li July 1, 2022, 1:21 a.m. UTC | #7
On 7/1/2022 6:21 AM, Michael Roth wrote:
> On Thu, Jun 30, 2022 at 12:14:13PM -0700, Vishal Annapurve wrote:
>> With transparent_hugepages=always setting I see issues with the
>> current implementation.
>>
>> Scenario:
>> 1) Guest accesses a gfn range 0x800-0xa00 as private
>> 2) Guest calls mapgpa to convert the range 0x84d-0x86e as shared
>> 3) Guest tries to access recently converted memory as shared for the first time
>> Guest VM shutdown is observed after step 3 -> Guest is unable to
>> proceed further since somehow code section is not as expected
>>
>> Corresponding KVM trace logs after step 3:
>> VCPU-0-61883   [078] ..... 72276.115679: kvm_page_fault: address
>> 84d000 error_code 4
>> VCPU-0-61883   [078] ..... 72276.127005: kvm_mmu_spte_requested: gfn
>> 84d pfn 100b4a4d level 2
>> VCPU-0-61883   [078] ..... 72276.127008: kvm_tdp_mmu_spte_changed: as
>> id 0 gfn 800 level 2 old_spte 100b1b16827 new_spte 100b4a00ea7
>> VCPU-0-61883   [078] ..... 72276.127009: kvm_mmu_prepare_zap_page: sp
>> gen 0 gfn 800 l1 8-byte q0 direct wux nxe ad root 0 sync
>> VCPU-0-61883   [078] ..... 72276.127009: kvm_tdp_mmu_spte_changed: as
>> id 0 gfn 800 level 1 old_spte 1003eb27e67 new_spte 5a0
>> VCPU-0-61883   [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as
>> id 0 gfn 801 level 1 old_spte 10056cc8e67 new_spte 5a0
>> VCPU-0-61883   [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as
>> id 0 gfn 802 level 1 old_spte 10056fa2e67 new_spte 5a0
>> VCPU-0-61883   [078] ..... 72276.127010: kvm_tdp_mmu_spte_changed: as
>> id 0 gfn 803 level 1 old_spte 0 new_spte 5a0
>> ....
>>   VCPU-0-61883   [078] ..... 72276.127089: kvm_tdp_mmu_spte_changed: as
>> id 0 gfn 9ff level 1 old_spte 100a43f4e67 new_spte 5a0
>>   VCPU-0-61883   [078] ..... 72276.127090: kvm_mmu_set_spte: gfn 800
>> spte 100b4a00ea7 (rwxu) level 2 at 10052fa5020
>>   VCPU-0-61883   [078] ..... 72276.127091: kvm_fpu: unload
>>
>> Looks like with transparent huge pages enabled kvm tried to handle the
>> shared memory fault on 0x84d gfn by coalescing nearby 4K pages
>> to form a contiguous 2MB page mapping at gfn 0x800, since level 2 was
>> requested in kvm_mmu_spte_requested.
>> This caused the private memory contents from regions 0x800-0x84c and
>> 0x86e-0xa00 to get unmapped from the guest leading to guest vm
>> shutdown.
> 
> Interesting... seems like that wouldn't be an issue for non-UPM SEV, since
> the private pages would still be mapped as part of that 2M mapping, and
> it's completely up to the guest as to whether it wants to access as
> private or shared. But for UPM it makes sense this would cause issues.
> 
>>
>> Does getting the mapping level as per the fault access type help
>> address the above issue? Any such coalescing should not cross between
>> private to
>> shared or shared to private memory regions.
> 
> Doesn't seem like changing the check to fault->is_private would help in
> your particular case, since the subsequent host_pfn_mapping_level() call
> only seems to limit the mapping level to whatever the mapping level is
> for the HVA in the host page table.
> 
> Seems like with UPM we need some additional handling here that also
> checks that the entire 2M HVA range is backed by non-private memory.
> 
> Non-UPM SNP hypervisor patches already have a similar hook added to
> host_pfn_mapping_level() which implements such a check via RMP table, so
> UPM might need something similar:
> 
>    https://github.com/AMDESE/linux/commit/ae4475bc740eb0b9d031a76412b0117339794139
> 
> -Mike
> 

For TDX, we try to track the page type (shared, private, mixed) of each
gfn at a given level. Only when the type is shared/private can it be
mapped at that level. When it's mixed, i.e., it contains both shared
pages and private pages at the given level, it has to go to the next smaller level.

https://github.com/intel/tdx/commit/ed97f4042eb69a210d9e972ccca6a84234028cad
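
A rough sketch of the rule described above; range_has_mixed_attr() stands in
for whatever per-gfn bookkeeping (an xarray, slot->arch.page_attr, ...) records
private vs. shared state, and is not an existing KVM helper:

/* Hypothetical helper: returns true if [start, end) mixes private and shared. */
bool range_has_mixed_attr(struct kvm *kvm, gfn_t start, gfn_t end);

/* Sketch only: drop to smaller page sizes until the range is uniform. */
static int adjust_level_for_attr(struct kvm *kvm, gfn_t gfn, int level)
{
	while (level > PG_LEVEL_4K) {
		gfn_t base = gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);

		/* Uniformly private or uniformly shared: map at this level. */
		if (!range_has_mixed_attr(kvm, base,
					  base + KVM_PAGES_PER_HPAGE(level)))
			break;

		/* Mixed: retry at the next smaller page size. */
		level--;
	}
	return level;
}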
Sean Christopherson July 7, 2022, 8:08 p.m. UTC | #8
On Fri, Jul 01, 2022, Xiaoyao Li wrote:
> On 7/1/2022 6:21 AM, Michael Roth wrote:
> > On Thu, Jun 30, 2022 at 12:14:13PM -0700, Vishal Annapurve wrote:
> > > With transparent_hugepages=always setting I see issues with the
> > > current implementation.

...

> > > Looks like with transparent huge pages enabled kvm tried to handle the
> > > shared memory fault on 0x84d gfn by coalescing nearby 4K pages
> > > to form a contiguous 2MB page mapping at gfn 0x800, since level 2 was
> > > requested in kvm_mmu_spte_requested.
> > > This caused the private memory contents from regions 0x800-0x84c and
> > > 0x86e-0xa00 to get unmapped from the guest leading to guest vm
> > > shutdown.
> > 
> > Interesting... seems like that wouldn't be an issue for non-UPM SEV, since
> > the private pages would still be mapped as part of that 2M mapping, and
> > it's completely up to the guest as to whether it wants to access as
> > private or shared. But for UPM it makes sense this would cause issues.
> > 
> > > 
> > > Does getting the mapping level as per the fault access type help
> > > address the above issue? Any such coalescing should not cross between
> > > private to
> > > shared or shared to private memory regions.
> > 
> > Doesn't seem like changing the check to fault->is_private would help in
> > your particular case, since the subsequent host_pfn_mapping_level() call
> > only seems to limit the mapping level to whatever the mapping level is
> > for the HVA in the host page table.
> > 
> > Seems like with UPM we need some additional handling here that also
> > checks that the entire 2M HVA range is backed by non-private memory.
> > 
> > Non-UPM SNP hypervisor patches already have a similar hook added to
> > host_pfn_mapping_level() which implements such a check via RMP table, so
> > UPM might need something similar:
> > 
> >    https://github.com/AMDESE/linux/commit/ae4475bc740eb0b9d031a76412b0117339794139
> > 
> > -Mike
> > 
> 
> For TDX, we try to track the page type (shared, private, mixed) of each gfn
> at given level. Only when the type is shared/private, can it be mapped at
> that level. When it's mixed, i.e., it contains both shared pages and private
> pages at given level, it has to go to next smaller level.
> 
> https://github.com/intel/tdx/commit/ed97f4042eb69a210d9e972ccca6a84234028cad

Hmm, so a new slot->arch.page_attr array shouldn't be necessary; KVM can instead
update slot->arch.lpage_info on shared<->private conversions.  Detecting whether
a given range is partially mapped could get nasty if KVM defers tracking to the
backing store, but if KVM itself does the tracking as was previously suggested[*],
then updating lpage_info should be relatively straightforward, e.g. use
xa_for_each_range() to see if a given 2mb/1gb range is completely covered (fully
shared) or not covered at all (fully private).

[*] https://lore.kernel.org/all/YofeZps9YXgtP3f1@google.com
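
A minimal sketch of the xa_for_each_range() idea, assuming KVM itself keeps an
xarray with one entry per private gfn (the inverse convention works the same
way); the xarray and helper names are illustrative, not existing KVM code:

/* Sketch only: a large page is usable iff the range is not "mixed". */
static bool hugepage_range_is_mixed(struct xarray *attr_array,
				    gfn_t start, gfn_t end)
{
	unsigned long index, nr_private = 0;
	void *entry;

	/* Count how many gfns in [start, end) have a private entry. */
	xa_for_each_range(attr_array, index, entry, start, end - 1)
		nr_private++;

	/* Fully private or fully shared is fine; anything else is mixed. */
	return nr_private && nr_private != end - start;
}

The result could then be folded into slot->arch.lpage_info on each conversion,
e.g. by setting or clearing a dedicated bit in disallow_lpage for the affected
2MB/1GB entries.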
Xiaoyao Li July 8, 2022, 3:29 a.m. UTC | #9
On 7/8/2022 4:08 AM, Sean Christopherson wrote:
> On Fri, Jul 01, 2022, Xiaoyao Li wrote:
>> On 7/1/2022 6:21 AM, Michael Roth wrote:
>>> On Thu, Jun 30, 2022 at 12:14:13PM -0700, Vishal Annapurve wrote:
>>>> With transparent_hugepages=always setting I see issues with the
>>>> current implementation.
> 
> ...
> 
>>>> Looks like with transparent huge pages enabled kvm tried to handle the
>>>> shared memory fault on 0x84d gfn by coalescing nearby 4K pages
>>>> to form a contiguous 2MB page mapping at gfn 0x800, since level 2 was
>>>> requested in kvm_mmu_spte_requested.
>>>> This caused the private memory contents from regions 0x800-0x84c and
>>>> 0x86e-0xa00 to get unmapped from the guest leading to guest vm
>>>> shutdown.
>>>
>>> Interesting... seems like that wouldn't be an issue for non-UPM SEV, since
>>> the private pages would still be mapped as part of that 2M mapping, and
>>> it's completely up to the guest as to whether it wants to access as
>>> private or shared. But for UPM it makes sense this would cause issues.
>>>
>>>>
>>>> Does getting the mapping level as per the fault access type help
>>>> address the above issue? Any such coalescing should not cross between
>>>> private to
>>>> shared or shared to private memory regions.
>>>
>>> Doesn't seem like changing the check to fault->is_private would help in
>>> your particular case, since the subsequent host_pfn_mapping_level() call
>>> only seems to limit the mapping level to whatever the mapping level is
>>> for the HVA in the host page table.
>>>
>>> Seems like with UPM we need some additional handling here that also
>>> checks that the entire 2M HVA range is backed by non-private memory.
>>>
>>> Non-UPM SNP hypervisor patches already have a similar hook added to
>>> host_pfn_mapping_level() which implements such a check via RMP table, so
>>> UPM might need something similar:
>>>
>>>     https://github.com/AMDESE/linux/commit/ae4475bc740eb0b9d031a76412b0117339794139
>>>
>>> -Mike
>>>
>>
>> For TDX, we try to track the page type (shared, private, mixed) of each gfn
>> at given level. Only when the type is shared/private, can it be mapped at
>> that level. When it's mixed, i.e., it contains both shared pages and private
>> pages at given level, it has to go to next smaller level.
>>
>> https://github.com/intel/tdx/commit/ed97f4042eb69a210d9e972ccca6a84234028cad
> 
> Hmm, so a new slot->arch.page_attr array shouldn't be necessary, KVM can instead
> update slot->arch.lpage_info on shared<->private conversions.  Detecting whether
> a given range is partially mapped could get nasty if KVM defers tracking to the
> backing store, but if KVM itself does the tracking as was previously suggested[*],
> then updating lpage_info should be relatively straightforward, e.g. use
> xa_for_each_range() to see if a given 2mb/1gb range is completely covered (fully
> shared) or not covered at all (fully private).
> 
> [*] https://lore.kernel.org/all/YofeZps9YXgtP3f1@google.com

Yes, slot->arch.page_attr was introduced to help identify whether a page
is completely shared/private at a given level. It seems an xarray can serve
the same purpose, though I know nothing about it. Looking forward to
seeing the patch that uses an xarray.

Yes, updating slot->arch.lpage_info is a good way to utilize the existing logic,
and Isaku has applied it to slot->arch.lpage_info in the 2MB support patches.
Vishal Annapurve July 20, 2022, 11:08 p.m. UTC | #10
> > Hmm, so a new slot->arch.page_attr array shouldn't be necessary, KVM can instead
> > update slot->arch.lpage_info on shared<->private conversions.  Detecting whether
> > a given range is partially mapped could get nasty if KVM defers tracking to the
> > backing store, but if KVM itself does the tracking as was previously suggested[*],
> > then updating lpage_info should be relatively straightforward, e.g. use
> > xa_for_each_range() to see if a given 2mb/1gb range is completely covered (fully
> > shared) or not covered at all (fully private).
> >
> > [*] https://lore.kernel.org/all/YofeZps9YXgtP3f1@google.com
>
> Yes, slot->arch.page_attr was introduced to help identify whether a page
> is completely shared/private at given level. It seems XARRAY can serve
> the same purpose, though I know nothing about it. Looking forward to
> seeing the patch of using XARRAY.
>
> yes, update slot->arch.lpage_info is good to utilize the existing logic
> and Isaku has applied it to slot->arch.lpage_info for 2MB support patches.

Chao, are you planning to implement these changes to ensure proper
handling of hugepages partially mapped as private/shared in subsequent
versions of this series?
Or is this something left to be handled by the architecture specific code?

Regards,
Vishal
Chao Peng July 21, 2022, 9:45 a.m. UTC | #11
On Wed, Jul 20, 2022 at 04:08:10PM -0700, Vishal Annapurve wrote:
> > > Hmm, so a new slot->arch.page_attr array shouldn't be necessary, KVM can instead
> > > update slot->arch.lpage_info on shared<->private conversions.  Detecting whether
> > > a given range is partially mapped could get nasty if KVM defers tracking to the
> > > backing store, but if KVM itself does the tracking as was previously suggested[*],
> > > then updating lpage_info should be relatively straightforward, e.g. use
> > > xa_for_each_range() to see if a given 2mb/1gb range is completely covered (fully
> > > shared) or not covered at all (fully private).
> > >
> > > [*] https://lore.kernel.org/all/YofeZps9YXgtP3f1@google.com
> >
> > Yes, slot->arch.page_attr was introduced to help identify whether a page
> > is completely shared/private at given level. It seems XARRAY can serve
> > the same purpose, though I know nothing about it. Looking forward to
> > seeing the patch of using XARRAY.
> >
> > yes, update slot->arch.lpage_info is good to utilize the existing logic
> > and Isaku has applied it to slot->arch.lpage_info for 2MB support patches.
> 
> Chao, are you planning to implement these changes to ensure proper
> handling of hugepages partially mapped as private/shared in subsequent
> versions of this series?
> Or is this something left to be handled by the architecture specific code?

Ah, the topic got moved to a different place, so I should update here.
There were more discussions under the TDX KVM patch series and I actually
just sent out the draft code for this:

https://lkml.org/lkml/2022/7/20/610

That patch is based on UPM v7 here. If I can get more feedback there,
then I will include an updated version in UPM v8.

If you have the bandwidth, you can also play with that patch; any feedback
is welcome.

Chao
> 
> Regards,
> Vishal
Kirill A . Shutemov Aug. 19, 2022, 12:40 a.m. UTC | #12
On Fri, Jun 17, 2022 at 09:30:53PM +0000, Sean Christopherson wrote:
> > @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> >  		read_unlock(&vcpu->kvm->mmu_lock);
> >  	else
> >  		write_unlock(&vcpu->kvm->mmu_lock);
> > -	kvm_release_pfn_clean(fault->pfn);
> > +
> > +	if (fault->is_private)
> > +		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
> 
> Why does the shmem path lock the page, and then unlock it here?

The lock is required to avoid a race with truncate / punch hole. If truncate
happens after get_pfn() but before the pfn gets into the SEPT, we are screwed.

> Same question for why this path marks it dirty?  The guest has the page mapped
> so the dirty flag is immediately stale.

If the page is clean and its refcount is not elevated, vmscan is free to drop the
page from the page cache. I don't think we want this.

> In other words, why does KVM need to do something different for private pfns?

Because in the traditional KVM memslot scheme, core mm takes care of
this.

The change in v7 is wrong. The page has to be locked until it lands in the SEPT and
must be made dirty before unlocking.
Sean Christopherson Aug. 25, 2022, 11:43 p.m. UTC | #13
On Fri, Aug 19, 2022, Kirill A. Shutemov wrote:
> On Fri, Jun 17, 2022 at 09:30:53PM +0000, Sean Christopherson wrote:
> > > @@ -4088,7 +4144,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> > >  		read_unlock(&vcpu->kvm->mmu_lock);
> > >  	else
> > >  		write_unlock(&vcpu->kvm->mmu_lock);
> > > -	kvm_release_pfn_clean(fault->pfn);
> > > +
> > > +	if (fault->is_private)
> > > +		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
> > 
> > Why does the shmem path lock the page, and then unlock it here?
> 
> Lock is require to avoid race with truncate / punch hole. Like if truncate
> happens after get_pfn(), but before it gets into SEPT we are screwed.

Getting the PFN into the SPTE doesn't provide protection in and of itself.  The
protection against truncation and whatnot comes from KVM getting a notification
and either retrying the fault (notification acquires mmu_lock before
direct_page_fault()), or blocking the notification (truncate / punch hole) until
after KVM installs the SPTE.  I.e. KVM just needs to ensure it doesn't install a
SPTE _after_ getting notified.

If the API is similar to gup(), i.e. only elevates the refcount but doesn't lock
the page, then there's no need for a separate kvm_private_mem_put_pfn(), and in
fact no need for ->put_unlock_pfn(), because KVM can do set_page_dirty() and
put_page() directly as needed using all of KVM's existing mechanisms.
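
Under that gup()-like model, the release path in direct_page_fault() could
collapse back to the existing helpers; a sketch (the identifiers are the ones
already used in that function):

out_unlock:
	if (is_tdp_mmu_fault)
		read_unlock(&vcpu->kvm->mmu_lock);
	else
		write_unlock(&vcpu->kvm->mmu_lock);
	/*
	 * Same call for shared and private pfns: kvm_release_pfn_clean() is
	 * put_page() underneath, and kvm_release_pfn_dirty() is available if
	 * the page genuinely needs set_page_dirty().
	 */
	kvm_release_pfn_clean(fault->pfn);
	return r;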

Patch

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7e258cc94152..c84835762249 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -176,6 +176,7 @@  struct kvm_page_fault {
 
 	/* Derived from mmu and global state.  */
 	const bool is_tdp;
+	const bool is_private;
 	const bool nx_huge_page_workaround_enabled;
 
 	/*
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index afe18d70ece7..e18460e0d743 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2899,6 +2899,9 @@  int kvm_mmu_max_mapping_level(struct kvm *kvm,
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
+	if (kvm_slot_is_private(slot))
+		return max_level;
+
 	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
 	return min(host_level, max_level);
 }
@@ -3948,10 +3951,54 @@  static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
+static inline u8 order_to_level(int order)
+{
+	enum pg_level level;
+
+	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > PG_LEVEL_4K; level--)
+		if (order >= page_level_shift(level) - PAGE_SHIFT)
+			return level;
+	return level;
+}
+
+static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
+				   struct kvm_page_fault *fault)
+{
+	int order;
+	struct kvm_memory_slot *slot = fault->slot;
+	bool private_exist = !kvm_private_mem_get_pfn(slot, fault->gfn,
+						      &fault->pfn, &order);
+
+	if (fault->is_private != private_exist) {
+		if (private_exist)
+			kvm_private_mem_put_pfn(slot, fault->pfn);
+
+		vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
+		if (fault->is_private)
+			vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE;
+		else
+			vcpu->run->memory.flags = 0;
+		vcpu->run->memory.padding = 0;
+		vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT;
+		vcpu->run->memory.size = PAGE_SIZE;
+		return RET_PF_USER;
+	}
+
+	if (fault->is_private) {
+		fault->max_level = min(order_to_level(order), fault->max_level);
+		fault->map_writable = !(slot->flags & KVM_MEM_READONLY);
+		return RET_PF_FIXED;
+	}
+
+	/* Fault is shared, fallthrough to the standard path. */
+	return RET_PF_CONTINUE;
+}
+
 static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_memory_slot *slot = fault->slot;
 	bool async;
+	int r;
 
 	/*
 	 * Retry the page fault if the gfn hit a memslot that is being deleted
@@ -3980,6 +4027,12 @@  static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 			return RET_PF_EMULATE;
 	}
 
+	if (kvm_slot_is_private(slot)) {
+		r = kvm_faultin_pfn_private(vcpu, fault);
+		if (r != RET_PF_CONTINUE)
+			return r == RET_PF_FIXED ? RET_PF_CONTINUE : r;
+	}
+
 	async = false;
 	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
 					  fault->write, &fault->map_writable,
@@ -4028,8 +4081,11 @@  static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
 	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
 		return true;
 
-	return fault->slot &&
-	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+	if (fault->is_private)
+		return mmu_notifier_retry(vcpu->kvm, mmu_seq);
+	else
+		return fault->slot &&
+			mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
 }
 
 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -4088,7 +4144,12 @@  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		read_unlock(&vcpu->kvm->mmu_lock);
 	else
 		write_unlock(&vcpu->kvm->mmu_lock);
-	kvm_release_pfn_clean(fault->pfn);
+
+	if (fault->is_private)
+		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
+	else
+		kvm_release_pfn_clean(fault->pfn);
+
 	return r;
 }
 
@@ -5372,6 +5433,9 @@  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
 			return -EIO;
 	}
 
+	if (r == RET_PF_USER)
+		return 0;
+
 	if (r < 0)
 		return r;
 	if (r != RET_PF_EMULATE)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index c0e502b17ef7..14932cf97655 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -147,6 +147,7 @@  unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
  * RET_PF_RETRY: let CPU fault again on the address.
  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_USER: need to exit to userspace to handle this fault.
  * RET_PF_FIXED: The faulting entry has been fixed.
  * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
  *
@@ -163,6 +164,7 @@  enum {
 	RET_PF_RETRY,
 	RET_PF_EMULATE,
 	RET_PF_INVALID,
+	RET_PF_USER,
 	RET_PF_FIXED,
 	RET_PF_SPURIOUS,
 };
@@ -178,4 +180,19 @@  void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+#ifndef CONFIG_HAVE_KVM_PRIVATE_MEM
+static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
+					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
+{
+	WARN_ON_ONCE(1);
+	return -EOPNOTSUPP;
+}
+
+static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
+					   kvm_pfn_t pfn)
+{
+	WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ae86820cef69..2d7555381955 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -58,6 +58,7 @@  TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
 TRACE_DEFINE_ENUM(RET_PF_RETRY);
 TRACE_DEFINE_ENUM(RET_PF_EMULATE);
 TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_USER);
 TRACE_DEFINE_ENUM(RET_PF_FIXED);
 TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 7f8f1c8dbed2..1d857919a947 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -878,7 +878,10 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 
 out_unlock:
 	write_unlock(&vcpu->kvm->mmu_lock);
-	kvm_release_pfn_clean(fault->pfn);
+	if (fault->is_private)
+		kvm_private_mem_put_pfn(fault->slot, fault->pfn);
+	else
+		kvm_release_pfn_clean(fault->pfn);
 	return r;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3fd168972ecd..b0a7910505ed 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2241,4 +2241,26 @@  static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 /* Max number of entries allowed for each kvm dirty ring */
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
+#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
+static inline int kvm_private_mem_get_pfn(struct kvm_memory_slot *slot,
+					  gfn_t gfn, kvm_pfn_t *pfn, int *order)
+{
+	int ret;
+	pfn_t pfnt;
+	pgoff_t index = gfn - slot->base_gfn +
+			(slot->private_offset >> PAGE_SHIFT);
+
+	ret = slot->notifier.bs->get_lock_pfn(slot->private_file, index, &pfnt,
+						order);
+	*pfn = pfn_t_to_pfn(pfnt);
+	return ret;
+}
+
+static inline void kvm_private_mem_put_pfn(struct kvm_memory_slot *slot,
+					   kvm_pfn_t pfn)
+{
+	slot->notifier.bs->put_unlock_pfn(pfn_to_pfn_t(pfn));
+}
+#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
+
 #endif