
[33/89] KVM: arm64: Handle guest stage-2 page-tables entirely at EL2

Message ID 20220519134204.5379-34-will@kernel.org (mailing list archive)
State New, archived
Series KVM: arm64: Base support for the pKVM hypervisor at EL2

Commit Message

Will Deacon May 19, 2022, 1:41 p.m. UTC
Now that EL2 is able to manage guest stage-2 page-tables, avoid
allocating a separate MMU structure in the host and instead introduce a
new fault handler which responds to guest stage-2 faults by sharing
GUP-pinned pages with the guest via a hypercall. These pages are
recovered (and unpinned) on guest teardown via the page reclaim
hypercall.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kvm_asm.h   |   1 +
 arch/arm64/include/asm/kvm_host.h  |   6 ++
 arch/arm64/kvm/arm.c               |  10 ++-
 arch/arm64/kvm/hyp/nvhe/hyp-main.c |  49 ++++++++++-
 arch/arm64/kvm/mmu.c               | 137 +++++++++++++++++++++++++++--
 arch/arm64/kvm/pkvm.c              |  17 ++++
 6 files changed, 212 insertions(+), 8 deletions(-)

Comments

Alexandru Elisei May 20, 2022, 4:03 p.m. UTC | #1
Hi,

On Thu, May 19, 2022 at 02:41:08PM +0100, Will Deacon wrote:
> Now that EL2 is able to manage guest stage-2 page-tables, avoid
> allocating a separate MMU structure in the host and instead introduce a
> new fault handler which responds to guest stage-2 faults by sharing
> GUP-pinned pages with the guest via a hypercall. These pages are
> recovered (and unpinned) on guest teardown via the page reclaim
> hypercall.
> 
> Signed-off-by: Will Deacon <will@kernel.org>
> ---
[..]
> +static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> +			  unsigned long hva)
> +{
> +	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.pkvm_memcache;
> +	struct mm_struct *mm = current->mm;
> +	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
> +	struct kvm_pinned_page *ppage;
> +	struct kvm *kvm = vcpu->kvm;
> +	struct page *page;
> +	u64 pfn;
> +	int ret;
> +
> +	ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm));
> +	if (ret)
> +		return -ENOMEM;
> +
> +	ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
> +	if (!ppage)
> +		return -ENOMEM;
> +
> +	ret = account_locked_vm(mm, 1, true);
> +	if (ret)
> +		goto free_ppage;
> +
> +	mmap_read_lock(mm);
> +	ret = pin_user_pages(hva, 1, flags, &page, NULL);

When I implemented memory pinning via GUP for the KVM SPE series, I
discovered that the pages were regularly unmapped at stage 2 because of
automatic numa balancing, as change_prot_numa() ends up calling
mmu_notifier_invalidate_range_start().

I was curious how you managed to avoid that; I don't know my way around
pKVM and can't seem to find where that's implemented.

Thanks,
Alex
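
For context on the mechanism described above: on arm64, KVM's
invalidate_range_start notifier boils down to a stage-2 unmap of the affected
guest range, so any primary-MMU protection change (such as NUMA balancing
making PTEs PROT_NONE) tears down the corresponding stage-2 entries. A sketch,
paraphrasing (not quoting verbatim) the mainline arm64 handler of the time:

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/* No stage-2 page-table yet, nothing to unmap. */
	if (!kvm->arch.mmu.pgt)
		return false;

	/* Drop the stage-2 mappings for the notified gfn range. */
	__unmap_stage2_range(&kvm->arch.mmu,
			     range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);
	return false;
}
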
Will Deacon May 31, 2022, 4:45 p.m. UTC | #2
On Fri, May 20, 2022 at 05:03:29PM +0100, Alexandru Elisei wrote:
> On Thu, May 19, 2022 at 02:41:08PM +0100, Will Deacon wrote:
> > Now that EL2 is able to manage guest stage-2 page-tables, avoid
> > allocating a separate MMU structure in the host and instead introduce a
> > new fault handler which responds to guest stage-2 faults by sharing
> > GUP-pinned pages with the guest via a hypercall. These pages are
> > recovered (and unpinned) on guest teardown via the page reclaim
> > hypercall.
> > 
> > Signed-off-by: Will Deacon <will@kernel.org>
> > ---
> [..]
> > +static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > +			  unsigned long hva)
> > +{
> > +	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.pkvm_memcache;
> > +	struct mm_struct *mm = current->mm;
> > +	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
> > +	struct kvm_pinned_page *ppage;
> > +	struct kvm *kvm = vcpu->kvm;
> > +	struct page *page;
> > +	u64 pfn;
> > +	int ret;
> > +
> > +	ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm));
> > +	if (ret)
> > +		return -ENOMEM;
> > +
> > +	ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
> > +	if (!ppage)
> > +		return -ENOMEM;
> > +
> > +	ret = account_locked_vm(mm, 1, true);
> > +	if (ret)
> > +		goto free_ppage;
> > +
> > +	mmap_read_lock(mm);
> > +	ret = pin_user_pages(hva, 1, flags, &page, NULL);
> 
> When I implemented memory pinning via GUP for the KVM SPE series, I
> discovered that the pages were regularly unmapped at stage 2 because of
> automatic numa balancing, as change_prot_numa() ends up calling
> mmu_notifier_invalidate_range_start().
> 
> I was curious how you managed to avoid that; I don't know my way around
> pKVM and can't seem to find where that's implemented.

With this series, we don't take any notice of the MMU notifiers at EL2
so the stage-2 remains intact. The GUP pin will prevent the page from
being migrated as the rmap walker won't be able to drop the mapcount.

It's functional, but we'd definitely like to do better in the long term.
The fd-based approach that I mentioned in the cover letter gets us some of
the way there for protected guests ("private memory"), but non-protected
guests running under pKVM are proving to be pretty challenging (we need to
deal with things like sharing the zero page...).

Will
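
To make the pinning argument concrete, below is a minimal sketch of the
long-term GUP pin and its release; the helper names are made up for
illustration, but the pin_user_pages()/unpin_user_pages_dirty_lock() calls
mirror what pkvm_mem_abort() and kvm_shadow_destroy() do in the patch. The
FOLL_PIN | FOLL_LONGTERM reference is what keeps the physical page from being
migrated or reclaimed, even though nothing stops the primary MMU from changing
the userspace PTE itself.

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Illustrative helper (name not from the patch): take a long-term pin
 * on the single user page backing @hva, in the style of pkvm_mem_abort().
 */
static int example_pin_guest_page(unsigned long hva, struct page **pagep)
{
	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
	struct mm_struct *mm = current->mm;
	long pinned;

	mmap_read_lock(mm);
	pinned = pin_user_pages(hva, 1, flags, pagep, NULL);
	mmap_read_unlock(mm);

	if (pinned == -EHWPOISON)
		return -EHWPOISON;	/* caller raises SIGBUS, as the patch does */

	return pinned == 1 ? 0 : -EFAULT;
}

/*
 * Illustrative helper: release the pin, marking the page dirty because
 * the guest may have written to it (cf. kvm_shadow_destroy()).
 */
static void example_unpin_guest_page(struct page *page)
{
	unpin_user_pages_dirty_lock(&page, 1, true);
}
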
Huang, Shaoqin June 8, 2022, 1:16 a.m. UTC | #3
On 6/1/2022 12:45 AM, Will Deacon wrote:
> On Fri, May 20, 2022 at 05:03:29PM +0100, Alexandru Elisei wrote:
>> On Thu, May 19, 2022 at 02:41:08PM +0100, Will Deacon wrote:
>>> Now that EL2 is able to manage guest stage-2 page-tables, avoid
>>> allocating a separate MMU structure in the host and instead introduce a
>>> new fault handler which responds to guest stage-2 faults by sharing
>>> GUP-pinned pages with the guest via a hypercall. These pages are
>>> recovered (and unpinned) on guest teardown via the page reclaim
>>> hypercall.
>>>
>>> Signed-off-by: Will Deacon <will@kernel.org>
>>> ---
>> [..]
>>> +static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>> +			  unsigned long hva)
>>> +{
>>> +	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.pkvm_memcache;
>>> +	struct mm_struct *mm = current->mm;
>>> +	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
>>> +	struct kvm_pinned_page *ppage;
>>> +	struct kvm *kvm = vcpu->kvm;
>>> +	struct page *page;
>>> +	u64 pfn;
>>> +	int ret;
>>> +
>>> +	ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm));
>>> +	if (ret)
>>> +		return -ENOMEM;
>>> +
>>> +	ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
>>> +	if (!ppage)
>>> +		return -ENOMEM;
>>> +
>>> +	ret = account_locked_vm(mm, 1, true);
>>> +	if (ret)
>>> +		goto free_ppage;
>>> +
>>> +	mmap_read_lock(mm);
>>> +	ret = pin_user_pages(hva, 1, flags, &page, NULL);
>>
>> When I implemented memory pinning via GUP for the KVM SPE series, I
>> discovered that the pages were regularly unmapped at stage 2 because of
>> automatic numa balancing, as change_prot_numa() ends up calling
>> mmu_notifier_invalidate_range_start().
>>
>> I was curious how you managed to avoid that; I don't know my way around
>> pKVM and can't seem to find where that's implemented.
> 
> With this series, we don't take any notice of the MMU notifiers at EL2
> so the stage-2 remains intact. The GUP pin will prevent the page from
> being migrated as the rmap walker won't be able to drop the mapcount.
> 
> It's functional, but we'd definitely like to do better in the long term.
> The fd-based approach that I mentioned in the cover letter gets us some of
> the way there for protected guests ("private memory"), but non-protected
> guests running under pKVM are proving to be pretty challenging (we need to
> deal with things like sharing the zero page...).
> 
> Will

My understanding is that with pin_user_pages(), the pages used by guests
(both protected and non-protected) will stay resident for a long time and
will not be swapped out or migrated, so there is no need to care about the
MMU notifiers. Is that right?
Alexandru Elisei July 27, 2022, 9:59 a.m. UTC | #4
Hi,

On Wed, Jun 08, 2022 at 09:16:56AM +0800, Huang, Shaoqin wrote:
> 
> On 6/1/2022 12:45 AM, Will Deacon wrote:
> > On Fri, May 20, 2022 at 05:03:29PM +0100, Alexandru Elisei wrote:
> > > On Thu, May 19, 2022 at 02:41:08PM +0100, Will Deacon wrote:
> > > > Now that EL2 is able to manage guest stage-2 page-tables, avoid
> > > > allocating a separate MMU structure in the host and instead introduce a
> > > > new fault handler which responds to guest stage-2 faults by sharing
> > > > GUP-pinned pages with the guest via a hypercall. These pages are
> > > > recovered (and unpinned) on guest teardown via the page reclaim
> > > > hypercall.
> > > > 
> > > > Signed-off-by: Will Deacon <will@kernel.org>
> > > > ---
> > > [..]
> > > > +static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > > > +			  unsigned long hva)
> > > > +{
> > > > +	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.pkvm_memcache;
> > > > +	struct mm_struct *mm = current->mm;
> > > > +	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
> > > > +	struct kvm_pinned_page *ppage;
> > > > +	struct kvm *kvm = vcpu->kvm;
> > > > +	struct page *page;
> > > > +	u64 pfn;
> > > > +	int ret;
> > > > +
> > > > +	ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm));
> > > > +	if (ret)
> > > > +		return -ENOMEM;
> > > > +
> > > > +	ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
> > > > +	if (!ppage)
> > > > +		return -ENOMEM;
> > > > +
> > > > +	ret = account_locked_vm(mm, 1, true);
> > > > +	if (ret)
> > > > +		goto free_ppage;
> > > > +
> > > > +	mmap_read_lock(mm);
> > > > +	ret = pin_user_pages(hva, 1, flags, &page, NULL);
> > > 
> > > When I implemented memory pinning via GUP for the KVM SPE series, I
> > > discovered that the pages were regularly unmapped at stage 2 because of
> > > automatic numa balancing, as change_prot_numa() ends up calling
> > > mmu_notifier_invalidate_range_start().
> > > 
> > > I was curious how you managed to avoid that; I don't know my way around
> > > pKVM and can't seem to find where that's implemented.
> > 
> > With this series, we don't take any notice of the MMU notifiers at EL2
> > so the stage-2 remains intact. The GUP pin will prevent the page from
> > being migrated as the rmap walker won't be able to drop the mapcount.
> > 
> > It's functional, but we'd definitely like to do better in the long term.
> > The fd-based approach that I mentioned in the cover letter gets us some of
> > the way there for protected guests ("private memory"), but non-protected
> > guests running under pKVM are proving to be pretty challenging (we need to
> > deal with things like sharing the zero page...).
> > 
> > Will
> 
> My understanding is that with pin_user_pages(), the pages used by guests
> (both protected and non-protected) will stay resident for a long time and
> will not be swapped out or migrated, so there is no need to care about the
> MMU notifiers. Is that right?

There are two things here.

First, pinning a page means making the data persistent in memory. From
Documentation/core-api/pin_user_pages.rst:

"FOLL_PIN is a *replacement* for FOLL_GET, and is for short term pins on
pages whose data *will* get accessed. As such, FOLL_PIN is a "more severe"
form of pinning. And finally, FOLL_LONGTERM is an even more restrictive
case that has FOLL_PIN as a prerequisite: this is for pages that will be
pinned longterm, and whose data will be accessed."

It does not mean that the translation table entry for the page is not
modified for as long as the pin exists. In the example I gave, automatic
NUMA balancing changes the protection of translation table entries to
PAGE_NONE, which will invoke the MMU notifiers to unmap the corresponding
stage 2 entries, regardless of the fact that the pinned pages will not get
migrated the next time they are accessed.

There are other mechanisms in the kernel that do that, for example
split_huge_pmd(), which must always succeed, even if the THP is pinned (it
transfers the refcounts among the pages): "Note that split_huge_pmd()
doesn't have any limitations on refcounting: pmd can be split at any point
and never fails" (Documentation/vm/transhuge.rst, also see
__split_huge_pmd() from mm/huge_memory.c).

KSM also does that: it invokes the invalidate_range_start MMU notifier
before backing out of the merge because of the refcount (see mm/ksm.c::
try_to_merge_one_page -> write_protect_page).

This brings me to my second point: one might rightfully ask themselves (I
did!), why not invoke the MMU notifiers *after* checking that the page is
not pinned? It turns out that that is not reliable, because the refcount is
increased by GUP with the page table lock held (which is a spinlock), but by
their design the invalidate_range_start MMU notifiers must be called from
interruptible + preemptible context. The only way to avoid races would be
to call the MMU notifier while holding the page table lock, which is
impossible.

Hope my explanation has been adequate.

Thanks,
Alex
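
On the second point above, a hypothetical check like the one below (not
something KVM or this series implements) could not be relied upon: the
FOLL_PIN reference is taken under the page table spinlock, while
invalidate_range_start notifiers must run in sleepable context, so the
snapshot can go stale between the check and whatever is done with it.

#include <linux/kvm_host.h>
#include <linux/mm.h>

/*
 * HYPOTHETICAL and inherently racy: skip the stage-2 unmap for pages
 * that appear to be pinned. page_maybe_dma_pinned() is only a snapshot;
 * a pin can be taken or dropped before we act on the result, which is
 * why no such shortcut exists in the notifier path.
 */
static void hypothetical_notifier_invalidate(struct kvm *kvm,
					     struct page *page, gfn_t gfn)
{
	if (page_maybe_dma_pinned(page))
		return;		/* stale by the time we skip the unmap */

	/* ... unmap gfn from the stage-2 page-table here ... */
}
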
Huang, Shaoqin July 28, 2022, 6:50 a.m. UTC | #5
On 7/27/2022 5:59 PM, Alexandru Elisei wrote:
> Hi,
> 
> On Wed, Jun 08, 2022 at 09:16:56AM +0800, Huang, Shaoqin wrote:
>>
>> On 6/1/2022 12:45 AM, Will Deacon wrote:
>>> On Fri, May 20, 2022 at 05:03:29PM +0100, Alexandru Elisei wrote:
>>>> On Thu, May 19, 2022 at 02:41:08PM +0100, Will Deacon wrote:
>>>>> Now that EL2 is able to manage guest stage-2 page-tables, avoid
>>>>> allocating a separate MMU structure in the host and instead introduce a
>>>>> new fault handler which responds to guest stage-2 faults by sharing
>>>>> GUP-pinned pages with the guest via a hypercall. These pages are
>>>>> recovered (and unpinned) on guest teardown via the page reclaim
>>>>> hypercall.
>>>>>
>>>>> Signed-off-by: Will Deacon <will@kernel.org>
>>>>> ---
>>>> [..]
>>>>> +static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>> +			  unsigned long hva)
>>>>> +{
>>>>> +	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.pkvm_memcache;
>>>>> +	struct mm_struct *mm = current->mm;
>>>>> +	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
>>>>> +	struct kvm_pinned_page *ppage;
>>>>> +	struct kvm *kvm = vcpu->kvm;
>>>>> +	struct page *page;
>>>>> +	u64 pfn;
>>>>> +	int ret;
>>>>> +
>>>>> +	ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm));
>>>>> +	if (ret)
>>>>> +		return -ENOMEM;
>>>>> +
>>>>> +	ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
>>>>> +	if (!ppage)
>>>>> +		return -ENOMEM;
>>>>> +
>>>>> +	ret = account_locked_vm(mm, 1, true);
>>>>> +	if (ret)
>>>>> +		goto free_ppage;
>>>>> +
>>>>> +	mmap_read_lock(mm);
>>>>> +	ret = pin_user_pages(hva, 1, flags, &page, NULL);
>>>>
>>>> When I implemented memory pinning via GUP for the KVM SPE series, I
>>>> discovered that the pages were regularly unmapped at stage 2 because of
>>>> automatic numa balancing, as change_prot_numa() ends up calling
>>>> mmu_notifier_invalidate_range_start().
>>>>
>>>> I was curious how you managed to avoid that; I don't know my way around
>>>> pKVM and can't seem to find where that's implemented.
>>>
>>> With this series, we don't take any notice of the MMU notifiers at EL2
>>> so the stage-2 remains intact. The GUP pin will prevent the page from
>>> being migrated as the rmap walker won't be able to drop the mapcount.
>>>
>>> It's functional, but we'd definitely like to do better in the long term.
>>> The fd-based approach that I mentioned in the cover letter gets us some of
>>> the way there for protected guests ("private memory"), but non-protected
>>> guests running under pKVM are proving to be pretty challenging (we need to
>>> deal with things like sharing the zero page...).
>>>
>>> Will
>>
>> My understanding is that with pin_user_pages(), the pages used by guests
>> (both protected and non-protected) will stay resident for a long time and
>> will not be swapped out or migrated, so there is no need to care about the
>> MMU notifiers. Is that right?
> 
> There are two things here.
> 
> First, pinning a page means making the data persistent in memory. From
> Documentation/core-api/pin_user_pages.rst:
> 
> "FOLL_PIN is a *replacement* for FOLL_GET, and is for short term pins on
> pages whose data *will* get accessed. As such, FOLL_PIN is a "more severe"
> form of pinning. And finally, FOLL_LONGTERM is an even more restrictive
> case that has FOLL_PIN as a prerequisite: this is for pages that will be
> pinned longterm, and whose data will be accessed."
> 
> It does not mean that the translation table entry for the page is not
> modified for as long as the pin exists. In the example I gave, automatic
> NUMA balancing changes the protection of translation table entries to
> PAGE_NONE, which will invoke the MMU notifiers to unmap the corresponding
> stage 2 entries, regardless of the fact that the pinned pages will not get
> migrated the next time they are accessed.
> 
> There are other mechanisms in the kernel that do that, for example
> split_huge_pmd(), which must always succeed, even if the THP is pinned (it
> transfers the refcounts among the pages): "Note that split_huge_pmd()
> doesn't have any limitations on refcounting: pmd can be split at any point
> and never fails" (Documentation/vm/transhuge.rst, also see
> __split_huge_pmd() from mm/huge_memory.c).
> 
> KSM also does that: it invokes the invalidate_range_start MMU notifier
> before backing out of the merge because of the refcount (see mm/ksm.c::
> try_to_merge_one_page -> write_protect_page).
> 
> This brings me to my second point: one might rightfully ask themselves (I
> did!), why not invoke the MMU notifiers *after* checking that the page is
> not pinned? It turns out that that is not reliable, because the refcount is
> increased by GUP with the page table lock held (which is a spinlock), but by
> their design the invalidate_range_start MMU notifiers must be called from
> interruptible + preemptible context. The only way to avoid races would be
> to call the MMU notifier while holding the page table lock, which is
> impossible.
> 
> Hope my explanation has been adequate.
> 
> Thanks,
> Alex

Thanks for your clear explanation.

Patch

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index a68381699c40..931a351da3f2 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -65,6 +65,7 @@  enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
 	__KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
 	__KVM_HOST_SMCCC_FUNC___pkvm_host_reclaim_page,
+	__KVM_HOST_SMCCC_FUNC___pkvm_host_map_guest,
 	__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
 	__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
 	__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3c6ed1f3887d..9252841850e4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -158,10 +158,16 @@  struct kvm_s2_mmu {
 struct kvm_arch_memory_slot {
 };
 
+struct kvm_pinned_page {
+	struct list_head	link;
+	struct page		*page;
+};
+
 struct kvm_protected_vm {
 	unsigned int shadow_handle;
 	struct mutex shadow_lock;
 	struct kvm_hyp_memcache teardown_mc;
+	struct list_head pinned_pages;
 };
 
 struct kvm_arch {
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 694ba3792e9d..5b41551a978b 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -358,7 +358,11 @@  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm)))
 		static_branch_dec(&userspace_irqchip_in_use);
 
-	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+	if (is_protected_kvm_enabled())
+		free_hyp_memcache(&vcpu->arch.pkvm_memcache);
+	else
+		kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+
 	kvm_timer_vcpu_terminate(vcpu);
 	kvm_pmu_vcpu_destroy(vcpu);
 
@@ -385,6 +389,9 @@  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	struct kvm_s2_mmu *mmu;
 	int *last_ran;
 
+	if (is_protected_kvm_enabled())
+		goto nommu;
+
 	mmu = vcpu->arch.hw_mmu;
 	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
 
@@ -402,6 +409,7 @@  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		*last_ran = vcpu->vcpu_id;
 	}
 
+nommu:
 	vcpu->cpu = cpu;
 
 	kvm_vgic_load(vcpu);
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 7a0d95e28e00..245d267064b3 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -32,8 +32,6 @@  static void flush_shadow_state(struct kvm_shadow_vcpu_state *shadow_state)
 	shadow_vcpu->arch.sve_state	= kern_hyp_va(host_vcpu->arch.sve_state);
 	shadow_vcpu->arch.sve_max_vl	= host_vcpu->arch.sve_max_vl;
 
-	shadow_vcpu->arch.hw_mmu	= host_vcpu->arch.hw_mmu;
-
 	shadow_vcpu->arch.hcr_el2	= host_vcpu->arch.hcr_el2;
 	shadow_vcpu->arch.mdcr_el2	= host_vcpu->arch.mdcr_el2;
 	shadow_vcpu->arch.cptr_el2	= host_vcpu->arch.cptr_el2;
@@ -107,6 +105,52 @@  static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
 	cpu_reg(host_ctxt, 1) =  ret;
 }
 
+static int pkvm_refill_memcache(struct kvm_vcpu *shadow_vcpu,
+				struct kvm_vcpu *host_vcpu)
+{
+	struct kvm_shadow_vcpu_state *shadow_vcpu_state = get_shadow_state(shadow_vcpu);
+	u64 nr_pages = VTCR_EL2_LVLS(shadow_vcpu_state->shadow_vm->kvm.arch.vtcr) - 1;
+
+	return refill_memcache(&shadow_vcpu->arch.pkvm_memcache, nr_pages,
+			       &host_vcpu->arch.pkvm_memcache);
+}
+
+static void handle___pkvm_host_map_guest(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(u64, pfn, host_ctxt, 1);
+	DECLARE_REG(u64, gfn, host_ctxt, 2);
+	DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 3);
+	struct kvm_shadow_vcpu_state *shadow_state;
+	struct kvm_vcpu *shadow_vcpu;
+	struct kvm *host_kvm;
+	unsigned int handle;
+	int ret = -EINVAL;
+
+	if (!is_protected_kvm_enabled())
+		goto out;
+
+	host_vcpu = kern_hyp_va(host_vcpu);
+	host_kvm = kern_hyp_va(host_vcpu->kvm);
+	handle = host_kvm->arch.pkvm.shadow_handle;
+	shadow_state = pkvm_load_shadow_vcpu_state(handle, host_vcpu->vcpu_idx);
+	if (!shadow_state)
+		goto out;
+
+	host_vcpu = shadow_state->host_vcpu;
+	shadow_vcpu = &shadow_state->shadow_vcpu;
+
+	/* Topup shadow memcache with the host's */
+	ret = pkvm_refill_memcache(shadow_vcpu, host_vcpu);
+	if (ret)
+		goto out_put_state;
+
+	ret = __pkvm_host_share_guest(pfn, gfn, shadow_vcpu);
+out_put_state:
+	pkvm_put_shadow_vcpu_state(shadow_state);
+out:
+	cpu_reg(host_ctxt, 1) =  ret;
+}
+
 static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
 {
 	DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
@@ -297,6 +341,7 @@  static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__pkvm_host_share_hyp),
 	HANDLE_FUNC(__pkvm_host_unshare_hyp),
 	HANDLE_FUNC(__pkvm_host_reclaim_page),
+	HANDLE_FUNC(__pkvm_host_map_guest),
 	HANDLE_FUNC(__kvm_adjust_pc),
 	HANDLE_FUNC(__kvm_vcpu_run),
 	HANDLE_FUNC(__kvm_flush_vm_context),
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index df92b5f7ac63..c74c431588a3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -190,6 +190,22 @@  static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
 	__unmap_stage2_range(mmu, start, size, true);
 }
 
+static void pkvm_stage2_flush(struct kvm *kvm)
+{
+	struct kvm_pinned_page *ppage;
+
+	/*
+	 * Contrary to stage2_apply_range(), we don't need to check
+	 * whether the VM is being torn down, as this is always called
+	 * from a vcpu thread, and the list is only ever freed on VM
+	 * destroy (which only occurs when all vcpu are gone).
+	 */
+	list_for_each_entry(ppage, &kvm->arch.pkvm.pinned_pages, link) {
+		__clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE);
+		cond_resched_rwlock_write(&kvm->mmu_lock);
+	}
+}
+
 static void stage2_flush_memslot(struct kvm *kvm,
 				 struct kvm_memory_slot *memslot)
 {
@@ -215,9 +231,13 @@  static void stage2_flush_vm(struct kvm *kvm)
 	idx = srcu_read_lock(&kvm->srcu);
 	write_lock(&kvm->mmu_lock);
 
-	slots = kvm_memslots(kvm);
-	kvm_for_each_memslot(memslot, bkt, slots)
-		stage2_flush_memslot(kvm, memslot);
+	if (!is_protected_kvm_enabled()) {
+		slots = kvm_memslots(kvm);
+		kvm_for_each_memslot(memslot, bkt, slots)
+			stage2_flush_memslot(kvm, memslot);
+	} else if (!kvm_vm_is_protected(kvm)) {
+		pkvm_stage2_flush(kvm);
+	}
 
 	write_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
@@ -636,7 +656,9 @@  int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 		return -EINVAL;
 
 	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
-	if (phys_shift) {
+	if (is_protected_kvm_enabled()) {
+		phys_shift = kvm_ipa_limit;
+	} else if (phys_shift) {
 		if (phys_shift > kvm_ipa_limit ||
 		    phys_shift < ARM64_MIN_PARANGE_BITS)
 			return -EINVAL;
@@ -652,6 +674,11 @@  int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
 	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
 	kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+	INIT_LIST_HEAD(&kvm->arch.pkvm.pinned_pages);
+	mmu->arch = &kvm->arch;
+
+	if (is_protected_kvm_enabled())
+		return 0;
 
 	if (mmu->pgt != NULL) {
 		kvm_err("kvm_arch already initialized?\n");
@@ -760,6 +787,9 @@  void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
 	struct kvm_pgtable *pgt = NULL;
 
+	if (is_protected_kvm_enabled())
+		return;
+
 	write_lock(&kvm->mmu_lock);
 	pgt = mmu->pgt;
 	if (pgt) {
@@ -1113,6 +1143,99 @@  static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
 	return 0;
 }
 
+static int pkvm_host_map_guest(u64 pfn, u64 gfn, struct kvm_vcpu *vcpu)
+{
+	int ret = kvm_call_hyp_nvhe(__pkvm_host_map_guest, pfn, gfn, vcpu);
+
+	/*
+	 * Getting -EPERM at this point implies that the pfn has already been
+	 * mapped. This should only ever happen when two vCPUs faulted on the
+	 * same page, and the current one lost the race to do the mapping.
+	 */
+	return (ret == -EPERM) ? -EAGAIN : ret;
+}
+
+static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+			  unsigned long hva)
+{
+	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.pkvm_memcache;
+	struct mm_struct *mm = current->mm;
+	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
+	struct kvm_pinned_page *ppage;
+	struct kvm *kvm = vcpu->kvm;
+	struct page *page;
+	u64 pfn;
+	int ret;
+
+	ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm));
+	if (ret)
+		return -ENOMEM;
+
+	ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
+	if (!ppage)
+		return -ENOMEM;
+
+	ret = account_locked_vm(mm, 1, true);
+	if (ret)
+		goto free_ppage;
+
+	mmap_read_lock(mm);
+	ret = pin_user_pages(hva, 1, flags, &page, NULL);
+	mmap_read_unlock(mm);
+
+	if (ret == -EHWPOISON) {
+		kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
+		ret = 0;
+		goto dec_account;
+	} else if (ret != 1) {
+		ret = -EFAULT;
+		goto dec_account;
+	} else if (!PageSwapBacked(page)) {
+		/*
+		 * We really can't deal with page-cache pages returned by GUP
+		 * because (a) we may trigger writeback of a page for which we
+		 * no longer have access and (b) page_mkclean() won't find the
+		 * stage-2 mapping in the rmap so we can get out-of-whack with
+		 * the filesystem when marking the page dirty during unpinning
+		 * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
+		 * without asking ext4 first")).
+		 *
+		 * Ideally we'd just restrict ourselves to anonymous pages, but
+		 * we also want to allow memfd (i.e. shmem) pages, so check for
+		 * pages backed by swap in the knowledge that the GUP pin will
+		 * prevent try_to_unmap() from succeeding.
+		 */
+		ret = -EIO;
+		goto dec_account;
+	}
+
+	write_lock(&kvm->mmu_lock);
+	pfn = page_to_pfn(page);
+	ret = pkvm_host_map_guest(pfn, fault_ipa >> PAGE_SHIFT, vcpu);
+	if (ret) {
+		if (ret == -EAGAIN)
+			ret = 0;
+		goto unpin;
+	}
+
+	ppage->page = page;
+	INIT_LIST_HEAD(&ppage->link);
+	list_add(&ppage->link, &kvm->arch.pkvm.pinned_pages);
+	write_unlock(&kvm->mmu_lock);
+
+	return 0;
+
+unpin:
+	write_unlock(&kvm->mmu_lock);
+	unpin_user_pages(&page, 1);
+dec_account:
+	account_locked_vm(mm, 1, false);
+free_ppage:
+	kfree(ppage);
+
+	return ret;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -1470,7 +1593,11 @@  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		goto out_unlock;
 	}
 
-	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+	if (is_protected_kvm_enabled())
+		ret = pkvm_mem_abort(vcpu, fault_ipa, hva);
+	else
+		ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+
 	if (ret == 0)
 		ret = 1;
 out:
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index b174d6dfde36..40e5490ef453 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -6,6 +6,7 @@ 
 
 #include <linux/kvm_host.h>
 #include <linux/memblock.h>
+#include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/sort.h>
 
@@ -183,12 +184,28 @@  int kvm_shadow_create(struct kvm *kvm)
 
 void kvm_shadow_destroy(struct kvm *kvm)
 {
+	struct kvm_pinned_page *ppage, *tmp;
+	struct mm_struct *mm = current->mm;
+	struct list_head *ppages;
+
 	if (kvm->arch.pkvm.shadow_handle)
 		WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_shadow,
 					  kvm->arch.pkvm.shadow_handle));
 
 	kvm->arch.pkvm.shadow_handle = 0;
 	free_hyp_memcache(&kvm->arch.pkvm.teardown_mc);
+
+	ppages = &kvm->arch.pkvm.pinned_pages;
+	list_for_each_entry_safe(ppage, tmp, ppages, link) {
+		WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_reclaim_page,
+					  page_to_pfn(ppage->page)));
+		cond_resched();
+
+		account_locked_vm(mm, 1, false);
+		unpin_user_pages_dirty_lock(&ppage->page, 1, true);
+		list_del(&ppage->link);
+		kfree(ppage);
+	}
 }
 
 int kvm_init_pvm(struct kvm *kvm)