[v3,29/32] KVM: arm64: Wrap the host with a stage 2

Message ID 20210302150002.3685113-30-qperret@google.com (mailing list archive)
State New, archived
Series KVM: arm64: A stage 2 for the host

Commit Message

Quentin Perret March 2, 2021, 2:59 p.m. UTC
When KVM runs in protected nVHE mode, make use of a stage 2 page-table
to give the hypervisor some control over the host memory accesses. The
host stage 2 is created lazily using large block mappings if possible,
and will default to page mappings in the absence of a better solution.

From this point on, memory accesses from the host to protected memory
regions (e.g. marked PROT_NONE) are fatal and lead to hyp_panic().

Signed-off-by: Quentin Perret <qperret@google.com>
---
 arch/arm64/include/asm/kvm_asm.h              |   1 +
 arch/arm64/include/asm/kvm_cpufeature.h       |   2 +
 arch/arm64/kernel/image-vars.h                |   3 +
 arch/arm64/kvm/arm.c                          |  10 +
 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |  34 +++
 arch/arm64/kvm/hyp/nvhe/Makefile              |   2 +-
 arch/arm64/kvm/hyp/nvhe/hyp-init.S            |   1 +
 arch/arm64/kvm/hyp/nvhe/hyp-main.c            |  11 +
 arch/arm64/kvm/hyp/nvhe/mem_protect.c         | 213 ++++++++++++++++++
 arch/arm64/kvm/hyp/nvhe/setup.c               |   5 +
 arch/arm64/kvm/hyp/nvhe/switch.c              |   7 +-
 arch/arm64/kvm/hyp/nvhe/tlb.c                 |   4 +-
 12 files changed, 286 insertions(+), 7 deletions(-)
 create mode 100644 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
 create mode 100644 arch/arm64/kvm/hyp/nvhe/mem_protect.c

Comments

Will Deacon March 5, 2021, 7:29 p.m. UTC | #1
On Tue, Mar 02, 2021 at 02:59:59PM +0000, Quentin Perret wrote:
> When KVM runs in protected nVHE mode, make use of a stage 2 page-table
> to give the hypervisor some control over the host memory accesses. The
> host stage 2 is created lazily using large block mappings if possible,
> and will default to page mappings in the absence of a better solution.
> 
> From this point on, memory accesses from the host to protected memory
> regions (e.g. marked PROT_NONE) are fatal and lead to hyp_panic().
> 
> Signed-off-by: Quentin Perret <qperret@google.com>
> ---
>  arch/arm64/include/asm/kvm_asm.h              |   1 +
>  arch/arm64/include/asm/kvm_cpufeature.h       |   2 +
>  arch/arm64/kernel/image-vars.h                |   3 +
>  arch/arm64/kvm/arm.c                          |  10 +
>  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |  34 +++
>  arch/arm64/kvm/hyp/nvhe/Makefile              |   2 +-
>  arch/arm64/kvm/hyp/nvhe/hyp-init.S            |   1 +
>  arch/arm64/kvm/hyp/nvhe/hyp-main.c            |  11 +
>  arch/arm64/kvm/hyp/nvhe/mem_protect.c         | 213 ++++++++++++++++++
>  arch/arm64/kvm/hyp/nvhe/setup.c               |   5 +
>  arch/arm64/kvm/hyp/nvhe/switch.c              |   7 +-
>  arch/arm64/kvm/hyp/nvhe/tlb.c                 |   4 +-
>  12 files changed, 286 insertions(+), 7 deletions(-)
>  create mode 100644 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
>  create mode 100644 arch/arm64/kvm/hyp/nvhe/mem_protect.c

[...]

> diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> new file mode 100644
> index 000000000000..d293cb328cc4
> --- /dev/null
> +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> @@ -0,0 +1,34 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2020 Google LLC
> + * Author: Quentin Perret <qperret@google.com>
> + */
> +
> +#ifndef __KVM_NVHE_MEM_PROTECT__
> +#define __KVM_NVHE_MEM_PROTECT__
> +#include <linux/kvm_host.h>
> +#include <asm/kvm_hyp.h>
> +#include <asm/kvm_pgtable.h>
> +#include <asm/virt.h>
> +#include <nvhe/spinlock.h>
> +
> +struct host_kvm {
> +	struct kvm_arch arch;
> +	struct kvm_pgtable pgt;
> +	struct kvm_pgtable_mm_ops mm_ops;
> +	hyp_spinlock_t lock;
> +};
> +extern struct host_kvm host_kvm;
> +
> +int __pkvm_prot_finalize(void);
> +int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
> +void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
> +
> +static __always_inline void __load_host_stage2(void)
> +{
> +	if (static_branch_likely(&kvm_protected_mode_initialized))
> +		__load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
> +	else
> +		write_sysreg(0, vttbr_el2);

I realise you've just moved the code, but why is this needed? All we've
done is point the stage-2 ttb at 0x0 (i.e. we haven't disabled anything).

> +#endif /* __KVM_NVHE_MEM_PROTECT__ */
> diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
> index e204ea77ab27..ce49795324a7 100644
> --- a/arch/arm64/kvm/hyp/nvhe/Makefile
> +++ b/arch/arm64/kvm/hyp/nvhe/Makefile
> @@ -14,7 +14,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs))
>  
>  obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
>  	 hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \
> -	 cache.o cpufeature.o setup.o mm.o
> +	 cache.o cpufeature.o setup.o mm.o mem_protect.o
>  obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
>  	 ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
>  obj-y += $(lib-objs)
> diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> index f312672d895e..6fa01b04954f 100644
> --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> @@ -119,6 +119,7 @@ alternative_else_nop_endif
>  
>  	/* Invalidate the stale TLBs from Bootloader */
>  	tlbi	alle2
> +	tlbi	vmalls12e1
>  	dsb	sy
>  
>  	/*
> diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> index ae6503c9be15..f47028d3fd0a 100644
> --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> @@ -13,6 +13,7 @@
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>  
> +#include <nvhe/mem_protect.h>
>  #include <nvhe/mm.h>
>  #include <nvhe/trap_handler.h>
>  
> @@ -151,6 +152,10 @@ static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ct
>  	cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot);
>  }
>  
> +static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
> +{
> +	cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
> +}
>  typedef void (*hcall_t)(struct kvm_cpu_context *);
>  
>  #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
> @@ -174,6 +179,7 @@ static const hcall_t host_hcall[] = {
>  	HANDLE_FUNC(__pkvm_cpu_set_vector),
>  	HANDLE_FUNC(__pkvm_create_mappings),
>  	HANDLE_FUNC(__pkvm_create_private_mapping),
> +	HANDLE_FUNC(__pkvm_prot_finalize),
>  };
>  
>  static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
> @@ -226,6 +232,11 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
>  	case ESR_ELx_EC_SMC64:
>  		handle_host_smc(host_ctxt);
>  		break;
> +	case ESR_ELx_EC_IABT_LOW:
> +		fallthrough;
> +	case ESR_ELx_EC_DABT_LOW:
> +		handle_host_mem_abort(host_ctxt);
> +		break;
>  	default:
>  		hyp_panic();
>  	}
> diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> new file mode 100644
> index 000000000000..2252ad1a8945
> --- /dev/null
> +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> @@ -0,0 +1,213 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2020 Google LLC
> + * Author: Quentin Perret <qperret@google.com>
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <asm/kvm_cpufeature.h>
> +#include <asm/kvm_emulate.h>
> +#include <asm/kvm_hyp.h>
> +#include <asm/kvm_mmu.h>
> +#include <asm/kvm_pgtable.h>
> +#include <asm/stage2_pgtable.h>
> +
> +#include <hyp/switch.h>
> +
> +#include <nvhe/gfp.h>
> +#include <nvhe/memory.h>
> +#include <nvhe/mem_protect.h>
> +#include <nvhe/mm.h>
> +
> +extern unsigned long hyp_nr_cpus;
> +struct host_kvm host_kvm;
> +
> +struct hyp_pool host_s2_mem;
> +struct hyp_pool host_s2_dev;
> +
> +static void *host_s2_zalloc_pages_exact(size_t size)
> +{
> +	return hyp_alloc_pages(&host_s2_mem, get_order(size));
> +}
> +
> +static void *host_s2_zalloc_page(void *pool)
> +{
> +	return hyp_alloc_pages(pool, 0);
> +}
> +
> +static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
> +{
> +	unsigned long nr_pages, pfn;
> +	int ret;
> +
> +	pfn = hyp_virt_to_pfn(mem_pgt_pool);
> +	nr_pages = host_s2_mem_pgtable_pages();
> +	ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
> +	if (ret)
> +		return ret;
> +
> +	pfn = hyp_virt_to_pfn(dev_pgt_pool);
> +	nr_pages = host_s2_dev_pgtable_pages();
> +	ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
> +	if (ret)
> +		return ret;
> +
> +	host_kvm.mm_ops.zalloc_pages_exact = host_s2_zalloc_pages_exact;
> +	host_kvm.mm_ops.zalloc_page = host_s2_zalloc_page;
> +	host_kvm.mm_ops.phys_to_virt = hyp_phys_to_virt;
> +	host_kvm.mm_ops.virt_to_phys = hyp_virt_to_phys;
> +	host_kvm.mm_ops.page_count = hyp_page_count;
> +	host_kvm.mm_ops.get_page = hyp_get_page;
> +	host_kvm.mm_ops.put_page = hyp_put_page;

Same comment I had earlier on about struct initialisation -- I think you
can use the same trick here for the host_kvm.mm_ops.
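
e.g. something like this (untested), using a single compound-literal
assignment:

	host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) {
		.zalloc_pages_exact	= host_s2_zalloc_pages_exact,
		.zalloc_page		= host_s2_zalloc_page,
		.phys_to_virt		= hyp_phys_to_virt,
		.virt_to_phys		= hyp_virt_to_phys,
		.page_count		= hyp_page_count,
		.get_page		= hyp_get_page,
		.put_page		= hyp_put_page,
	};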

> +
> +	return 0;
> +}
> +
> +static void prepare_host_vtcr(void)
> +{
> +	u32 parange, phys_shift;
> +	u64 mmfr0, mmfr1;
> +
> +	mmfr0 = arm64_ftr_reg_id_aa64mmfr0_el1.sys_val;
> +	mmfr1 = arm64_ftr_reg_id_aa64mmfr1_el1.sys_val;
> +
> +	/* The host stage 2 is id-mapped, so use parange for T0SZ */
> +	parange = kvm_get_parange(mmfr0);
> +	phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
> +
> +	host_kvm.arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
> +}
> +
> +int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
> +{
> +	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
> +	int ret;
> +
> +	prepare_host_vtcr();
> +	hyp_spin_lock_init(&host_kvm.lock);
> +
> +	ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
> +	if (ret)
> +		return ret;
> +
> +	ret = kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch,
> +				      &host_kvm.mm_ops);
> +	if (ret)
> +		return ret;
> +
> +	mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
> +	mmu->arch = &host_kvm.arch;
> +	mmu->pgt = &host_kvm.pgt;
> +	mmu->vmid.vmid_gen = 0;
> +	mmu->vmid.vmid = 0;
> +
> +	return 0;
> +}
> +
> +int __pkvm_prot_finalize(void)
> +{
> +	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
> +	struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
> +
> +	params->vttbr = kvm_get_vttbr(mmu);
> +	params->vtcr = host_kvm.arch.vtcr;
> +	params->hcr_el2 |= HCR_VM;
> +	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> +		params->hcr_el2 |= HCR_FWB;
> +	kvm_flush_dcache_to_poc(params, sizeof(*params));
> +
> +	write_sysreg(params->hcr_el2, hcr_el2);
> +	__load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);

AFAICT, there's no ISB here. Do we need one before the TLB invalidation?

> +	__tlbi(vmalls12e1is);
> +	dsb(ish);

Given that the TLB is invalidated on the boot path, please can you add
a comment here about the stale entries which you need to invalidate?

Also, does this need to be inner-shareable? I thought this function ran on
each CPU.

> +	isb();
> +
> +	return 0;
> +}
> +
> +static void host_stage2_unmap_dev_all(void)
> +{
> +	struct kvm_pgtable *pgt = &host_kvm.pgt;
> +	struct memblock_region *reg;
> +	u64 addr = 0;
> +	int i;
> +
> +	/* Unmap all non-memory regions to recycle the pages */
> +	for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
> +		reg = &hyp_memory[i];
> +		kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
> +	}
> +	kvm_pgtable_stage2_unmap(pgt, addr, ULONG_MAX);

Is this just going to return -ERANGE?

> +static int host_stage2_idmap(u64 addr)
> +{
> +	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
> +	struct kvm_mem_range range;
> +	bool is_memory = find_mem_range(addr, &range);
> +	struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
> +	int ret;
> +
> +	if (is_memory)
> +		prot |= KVM_PGTABLE_PROT_X;
> +
> +	hyp_spin_lock(&host_kvm.lock);
> +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> +					      &range, pool);
> +	if (is_memory || ret != -ENOMEM)
> +		goto unlock;
> +	host_stage2_unmap_dev_all();
> +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> +					      &range, pool);

I find this _really_ hard to reason about, as range is passed by reference
and we don't reset it after the first call returns -ENOMEM for an MMIO
mapping. Maybe some commentary on why it's still valid here?

Other than that, this is looking really good to me.

Will
Quentin Perret March 8, 2021, 9:22 a.m. UTC | #2
On Friday 05 Mar 2021 at 19:29:06 (+0000), Will Deacon wrote:
> On Tue, Mar 02, 2021 at 02:59:59PM +0000, Quentin Perret wrote:
> > +static __always_inline void __load_host_stage2(void)
> > +{
> > +	if (static_branch_likely(&kvm_protected_mode_initialized))
> > +		__load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
> > +	else
> > +		write_sysreg(0, vttbr_el2);
> 
> I realise you've just moved the code, but why is this needed? All we've
> done is point the stage-2 ttb at 0x0 (i.e. we haven't disabled anything).

Right, but that is also used for tlb maintenance operations, e.g.
__tlb_switch_to_host() and friends.

> > +#endif /* __KVM_NVHE_MEM_PROTECT__ */
> > diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
> > index e204ea77ab27..ce49795324a7 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/Makefile
> > +++ b/arch/arm64/kvm/hyp/nvhe/Makefile
> > @@ -14,7 +14,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs))
> >  
> >  obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
> >  	 hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \
> > -	 cache.o cpufeature.o setup.o mm.o
> > +	 cache.o cpufeature.o setup.o mm.o mem_protect.o
> >  obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
> >  	 ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
> >  obj-y += $(lib-objs)
> > diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> > index f312672d895e..6fa01b04954f 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> > +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> > @@ -119,6 +119,7 @@ alternative_else_nop_endif
> >  
> >  	/* Invalidate the stale TLBs from Bootloader */
> >  	tlbi	alle2
> > +	tlbi	vmalls12e1
> >  	dsb	sy
> >  
> >  	/*
> > diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> > index ae6503c9be15..f47028d3fd0a 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> > +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> > @@ -13,6 +13,7 @@
> >  #include <asm/kvm_hyp.h>
> >  #include <asm/kvm_mmu.h>
> >  
> > +#include <nvhe/mem_protect.h>
> >  #include <nvhe/mm.h>
> >  #include <nvhe/trap_handler.h>
> >  
> > @@ -151,6 +152,10 @@ static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ct
> >  	cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot);
> >  }
> >  
> > +static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
> > +{
> > +	cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
> > +}
> >  typedef void (*hcall_t)(struct kvm_cpu_context *);
> >  
> >  #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
> > @@ -174,6 +179,7 @@ static const hcall_t host_hcall[] = {
> >  	HANDLE_FUNC(__pkvm_cpu_set_vector),
> >  	HANDLE_FUNC(__pkvm_create_mappings),
> >  	HANDLE_FUNC(__pkvm_create_private_mapping),
> > +	HANDLE_FUNC(__pkvm_prot_finalize),
> >  };
> >  
> >  static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
> > @@ -226,6 +232,11 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
> >  	case ESR_ELx_EC_SMC64:
> >  		handle_host_smc(host_ctxt);
> >  		break;
> > +	case ESR_ELx_EC_IABT_LOW:
> > +		fallthrough;
> > +	case ESR_ELx_EC_DABT_LOW:
> > +		handle_host_mem_abort(host_ctxt);
> > +		break;
> >  	default:
> >  		hyp_panic();
> >  	}
> > diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> > new file mode 100644
> > index 000000000000..2252ad1a8945
> > --- /dev/null
> > +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> > @@ -0,0 +1,213 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (C) 2020 Google LLC
> > + * Author: Quentin Perret <qperret@google.com>
> > + */
> > +
> > +#include <linux/kvm_host.h>
> > +#include <asm/kvm_cpufeature.h>
> > +#include <asm/kvm_emulate.h>
> > +#include <asm/kvm_hyp.h>
> > +#include <asm/kvm_mmu.h>
> > +#include <asm/kvm_pgtable.h>
> > +#include <asm/stage2_pgtable.h>
> > +
> > +#include <hyp/switch.h>
> > +
> > +#include <nvhe/gfp.h>
> > +#include <nvhe/memory.h>
> > +#include <nvhe/mem_protect.h>
> > +#include <nvhe/mm.h>
> > +
> > +extern unsigned long hyp_nr_cpus;
> > +struct host_kvm host_kvm;
> > +
> > +struct hyp_pool host_s2_mem;
> > +struct hyp_pool host_s2_dev;
> > +
> > +static void *host_s2_zalloc_pages_exact(size_t size)
> > +{
> > +	return hyp_alloc_pages(&host_s2_mem, get_order(size));
> > +}
> > +
> > +static void *host_s2_zalloc_page(void *pool)
> > +{
> > +	return hyp_alloc_pages(pool, 0);
> > +}
> > +
> > +static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
> > +{
> > +	unsigned long nr_pages, pfn;
> > +	int ret;
> > +
> > +	pfn = hyp_virt_to_pfn(mem_pgt_pool);
> > +	nr_pages = host_s2_mem_pgtable_pages();
> > +	ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
> > +	if (ret)
> > +		return ret;
> > +
> > +	pfn = hyp_virt_to_pfn(dev_pgt_pool);
> > +	nr_pages = host_s2_dev_pgtable_pages();
> > +	ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
> > +	if (ret)
> > +		return ret;
> > +
> > +	host_kvm.mm_ops.zalloc_pages_exact = host_s2_zalloc_pages_exact;
> > +	host_kvm.mm_ops.zalloc_page = host_s2_zalloc_page;
> > +	host_kvm.mm_ops.phys_to_virt = hyp_phys_to_virt;
> > +	host_kvm.mm_ops.virt_to_phys = hyp_virt_to_phys;
> > +	host_kvm.mm_ops.page_count = hyp_page_count;
> > +	host_kvm.mm_ops.get_page = hyp_get_page;
> > +	host_kvm.mm_ops.put_page = hyp_put_page;
> 
> Same comment I had earlier on about struct initialisation -- I think you
> can use the same trick here for the host_kvm.mm_ops.

+1

> > +
> > +	return 0;
> > +}
> > +
> > +static void prepare_host_vtcr(void)
> > +{
> > +	u32 parange, phys_shift;
> > +	u64 mmfr0, mmfr1;
> > +
> > +	mmfr0 = arm64_ftr_reg_id_aa64mmfr0_el1.sys_val;
> > +	mmfr1 = arm64_ftr_reg_id_aa64mmfr1_el1.sys_val;
> > +
> > +	/* The host stage 2 is id-mapped, so use parange for T0SZ */
> > +	parange = kvm_get_parange(mmfr0);
> > +	phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
> > +
> > +	host_kvm.arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
> > +}
> > +
> > +int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
> > +{
> > +	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
> > +	int ret;
> > +
> > +	prepare_host_vtcr();
> > +	hyp_spin_lock_init(&host_kvm.lock);
> > +
> > +	ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch,
> > +				      &host_kvm.mm_ops);
> > +	if (ret)
> > +		return ret;
> > +
> > +	mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
> > +	mmu->arch = &host_kvm.arch;
> > +	mmu->pgt = &host_kvm.pgt;
> > +	mmu->vmid.vmid_gen = 0;
> > +	mmu->vmid.vmid = 0;
> > +
> > +	return 0;
> > +}
> > +
> > +int __pkvm_prot_finalize(void)
> > +{
> > +	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
> > +	struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
> > +
> > +	params->vttbr = kvm_get_vttbr(mmu);
> > +	params->vtcr = host_kvm.arch.vtcr;
> > +	params->hcr_el2 |= HCR_VM;
> > +	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> > +		params->hcr_el2 |= HCR_FWB;
> > +	kvm_flush_dcache_to_poc(params, sizeof(*params));
> > +
> > +	write_sysreg(params->hcr_el2, hcr_el2);
> > +	__load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
> 
> AFAICT, there's no ISB here. Do we need one before the TLB invalidation?

You mean for the ARM64_WORKAROUND_SPECULATIVE_AT case? __load_stage2()
should already add one for me no?

> > +	__tlbi(vmalls12e1is);
> > +	dsb(ish);
> 
> Given that the TLB is invalidated on the boot path, please can you add
> a comment here about the stale entries which you need to invalidate?

Sure -- that is for HCR bits cached in TLBs for VMID 0. Thinking about
it I could probably reduce the tlbi scope as well.

> Also, does this need to be inner-shareable? I thought this function ran on
> each CPU.

Hmm, correct, nsh should do.

> > +	isb();
> > +
> > +	return 0;
> > +}
> > +
> > +static void host_stage2_unmap_dev_all(void)
> > +{
> > +	struct kvm_pgtable *pgt = &host_kvm.pgt;
> > +	struct memblock_region *reg;
> > +	u64 addr = 0;
> > +	int i;
> > +
> > +	/* Unmap all non-memory regions to recycle the pages */
> > +	for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
> > +		reg = &hyp_memory[i];
> > +		kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
> > +	}
> > +	kvm_pgtable_stage2_unmap(pgt, addr, ULONG_MAX);
> 
> Is this just going to return -ERANGE?

Hrmpf, yes, that wants BIT(pgt->ia_bits) I think. And that wants testing
as well, clearly.

> > +static int host_stage2_idmap(u64 addr)
> > +{
> > +	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
> > +	struct kvm_mem_range range;
> > +	bool is_memory = find_mem_range(addr, &range);
> > +	struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
> > +	int ret;
> > +
> > +	if (is_memory)
> > +		prot |= KVM_PGTABLE_PROT_X;
> > +
> > +	hyp_spin_lock(&host_kvm.lock);
> > +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> > +					      &range, pool);
> > +	if (is_memory || ret != -ENOMEM)
> > +		goto unlock;
> > +	host_stage2_unmap_dev_all();
> > +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> > +					      &range, pool);
> 
> I find this _really_ hard to reason about, as range is passed by reference
> and we don't reset it after the first call returns -ENOMEM for an MMIO
> mapping. Maybe some commentary on why it's still valid here?

Sure, I'll add something. FWIW, that is intended -- -ENOMEM can only be
caused by the call to kvm_pgtable_stage2_map() which leaves the range
untouched. So, as long as we don't release the lock, the range returned
by the first call to kvm_pgtable_stage2_idmap_greedy() should still be
valid. I suppose I could call kvm_pgtable_stage2_map() directly the
second time to make it obvious but I thought this would expose the
internals of kvm_pgtable_stage2_idmap_greedy() a little bit too much.
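
Something along these lines for the comment, perhaps:

	/*
	 * The first call can only fail with -ENOMEM from
	 * kvm_pgtable_stage2_map(), which doesn't touch the range, so it
	 * is still valid for the retry as long as we hold host_kvm.lock.
	 */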

Thanks,
Quentin
Will Deacon March 8, 2021, 12:46 p.m. UTC | #3
On Mon, Mar 08, 2021 at 09:22:29AM +0000, Quentin Perret wrote:
> On Friday 05 Mar 2021 at 19:29:06 (+0000), Will Deacon wrote:
> > On Tue, Mar 02, 2021 at 02:59:59PM +0000, Quentin Perret wrote:
> > > +static __always_inline void __load_host_stage2(void)
> > > +{
> > > +	if (static_branch_likely(&kvm_protected_mode_initialized))
> > > +		__load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
> > > +	else
> > > +		write_sysreg(0, vttbr_el2);
> > 
> > I realise you've just moved the code, but why is this needed? All we've
> > done is point the stage-2 ttb at 0x0 (i.e. we haven't disabled anything).
> 
> Right, but that is also used for tlb maintenance operations, e.g.
> __tlb_switch_to_host() and friends.

Good point, the VMID is needed in that situation.

> > > +int __pkvm_prot_finalize(void)
> > > +{
> > > +	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
> > > +	struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
> > > +
> > > +	params->vttbr = kvm_get_vttbr(mmu);
> > > +	params->vtcr = host_kvm.arch.vtcr;
> > > +	params->hcr_el2 |= HCR_VM;
> > > +	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> > > +		params->hcr_el2 |= HCR_FWB;
> > > +	kvm_flush_dcache_to_poc(params, sizeof(*params));
> > > +
> > > +	write_sysreg(params->hcr_el2, hcr_el2);
> > > +	__load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
> > 
> > AFAICT, there's no ISB here. Do we need one before the TLB invalidation?
> 
> You mean for the ARM64_WORKAROUND_SPECULATIVE_AT case? __load_stage2()
> should already add one for me no?

__load_stage2() _only_ has the ISB if ARM64_WORKAROUND_SPECULATIVE_AT is
present, whereas I think you need one unconditionally here so that the
system register write has taken effect before the TLB invalidation.

It's similar to the comment at the end of __tlb_switch_to_guest().
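
i.e. something like this (sketch; the first two lines are just your existing
code for context):

	write_sysreg(params->hcr_el2, hcr_el2);
	__load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
	isb();	/* Ensure the sysreg writes have taken effect before the TLBI */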

Having said that, I do worry that ARM64_WORKAROUND_SPECULATIVE_AT probably
needs a closer look in the world of pKVM, since it currently special-cases
the host.

> > > +	__tlbi(vmalls12e1is);
> > > +	dsb(ish);
> > 
> > Given that the TLB is invalidated on the boot path, please can you add
> > a comment here about the stale entries which you need to invalidate?
> 
> Sure -- that is for HCR bits cached in TLBs for VMID 0. Thinking about
> it I could probably reduce the tlbi scope as well.
> 
> > Also, does this need to be inner-shareable? I thought this function ran on
> > each CPU.
> 
> Hmm, correct, nsh should do.

Cool, then you can do that for both the TLBI and the DSB instructions (and
please add a comment that the invalidation is due to the HCR bits).
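
Something along these lines, say:

	/* Invalidate stale HCR_EL2 bits that may be cached in the TLBs for VMID 0 */
	__tlbi(vmalls12e1);
	dsb(nsh);
	isb();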

> > > +static void host_stage2_unmap_dev_all(void)
> > > +{
> > > +	struct kvm_pgtable *pgt = &host_kvm.pgt;
> > > +	struct memblock_region *reg;
> > > +	u64 addr = 0;
> > > +	int i;
> > > +
> > > +	/* Unmap all non-memory regions to recycle the pages */
> > > +	for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
> > > +		reg = &hyp_memory[i];
> > > +		kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
> > > +	}
> > > +	kvm_pgtable_stage2_unmap(pgt, addr, ULONG_MAX);
> > 
> > Is this just going to return -ERANGE?
> 
> Hrmpf, yes, that wants BIT(pgt->ia_bits) I think. And that wants testing
> as well, clearly.

Agreed, maybe it's worth checking the return value.

> > > +static int host_stage2_idmap(u64 addr)
> > > +{
> > > +	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
> > > +	struct kvm_mem_range range;
> > > +	bool is_memory = find_mem_range(addr, &range);
> > > +	struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
> > > +	int ret;
> > > +
> > > +	if (is_memory)
> > > +		prot |= KVM_PGTABLE_PROT_X;
> > > +
> > > +	hyp_spin_lock(&host_kvm.lock);
> > > +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> > > +					      &range, pool);
> > > +	if (is_memory || ret != -ENOMEM)
> > > +		goto unlock;
> > > +	host_stage2_unmap_dev_all();
> > > +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> > > +					      &range, pool);
> > 
> > I find this _really_ hard to reason about, as range is passed by reference
> > and we don't reset it after the first call returns -ENOMEM for an MMIO
> > mapping. Maybe some commentary on why it's still valid here?
> 
> Sure, I'll add something. FWIW, that is intended -- -ENOMEM can only be
> caused by the call to kvm_pgtable_stage2_map() which leaves the range
> untouched. So, as long as we don't release the lock, the range returned
> by the first call to kvm_pgtable_stage2_idmap_greedy() should still be
> valid. I suppose I could call kvm_pgtable_stage2_map() directly the
> second time to make it obvious but I thought this would expose the
> internals of kvm_pgtable_stage2_idmap_greedy() a little bit too much.

I can see it both ways, but updating the kerneldoc for
kvm_pgtable_stage2_idmap_greedy() to state in which cases the range is
updated and how would be helpful. It just says "negative error code on
failure" at the moment.

Will
Quentin Perret March 8, 2021, 1:38 p.m. UTC | #4
On Monday 08 Mar 2021 at 12:46:07 (+0000), Will Deacon wrote:
> __load_stage2() _only_ has the ISB if ARM64_WORKAROUND_SPECULATIVE_AT is
> present, whereas I think you need one unconditionally here so that the
> system register write has taken effect before the TLB invalidation.
> 
> It's similar to the comment at the end of __tlb_switch_to_guest().
> 
> Having said that, I do worry that ARM64_WORKAROUND_SPECULATIVE_AT probably
> needs a closer look in the world of pKVM, since it currently special-cases
> the host.

Yes, I see that now. I'll start looking into this.

> > > > +	__tlbi(vmalls12e1is);
> > > > +	dsb(ish);
> > > 
> > > Given that the TLB is invalidated on the boot path, please can you add
> > > a comment here about the stale entries which you need to invalidate?
> > 
> > Sure -- that is for HCR bits cached in TLBs for VMID 0. Thinking about
> > it I could probably reduce the tlbi scope as well.
> > 
> > > Also, does this need to be inner-shareable? I thought this function ran on
> > > each CPU.
> > 
> > Hmm, correct, nsh should do.
> 
> Cool, then you can do that for both the TLBI and the DSB instructions (and
> please add a comment that the invalidation is due to the HCR bits).

Sure.

> > > > +static void host_stage2_unmap_dev_all(void)
> > > > +{
> > > > +	struct kvm_pgtable *pgt = &host_kvm.pgt;
> > > > +	struct memblock_region *reg;
> > > > +	u64 addr = 0;
> > > > +	int i;
> > > > +
> > > > +	/* Unmap all non-memory regions to recycle the pages */
> > > > +	for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
> > > > +		reg = &hyp_memory[i];
> > > > +		kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
> > > > +	}
> > > > +	kvm_pgtable_stage2_unmap(pgt, addr, ULONG_MAX);
> > > 
> > > Is this just going to return -ERANGE?
> > 
> > Hrmpf, yes, that wants BIT(pgt->ia_bits) I think. And that wants testing
> > as well, clearly.
> 
> Agreed, maybe it's worth checking the return value.

Ack, and hyp_panic if != 0, but that is probably preferable anyway.
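
i.e. something like this for the last call (untested, assuming the size is
the IA-space limit minus addr):

	ret = kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
	if (ret)
		hyp_panic();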

> > > > +static int host_stage2_idmap(u64 addr)
> > > > +{
> > > > +	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
> > > > +	struct kvm_mem_range range;
> > > > +	bool is_memory = find_mem_range(addr, &range);
> > > > +	struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
> > > > +	int ret;
> > > > +
> > > > +	if (is_memory)
> > > > +		prot |= KVM_PGTABLE_PROT_X;
> > > > +
> > > > +	hyp_spin_lock(&host_kvm.lock);
> > > > +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> > > > +					      &range, pool);
> > > > +	if (is_memory || ret != -ENOMEM)
> > > > +		goto unlock;
> > > > +	host_stage2_unmap_dev_all();
> > > > +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> > > > +					      &range, pool);
> > > 
> > > I find this _really_ hard to reason about, as range is passed by reference
> > > and we don't reset it after the first call returns -ENOMEM for an MMIO
> > > mapping. Maybe some commentary on why it's still valid here?
> > 
> > Sure, I'll add something. FWIW, that is intended -- -ENOMEM can only be
> > caused by the call to kvm_pgtable_stage2_map() which leaves the range
> > untouched. So, as long as we don't release the lock, the range returned
> > by the first call to kvm_pgtable_stage2_idmap_greedy() should still be
> > valid. I suppose I could call kvm_pgtable_stage2_map() directly the
> > second time to make it obvious but I thought this would expose the
> > internals of kvm_pgtable_stage2_idmap_greedy() a little bit too much.
> 
> I can see it both ways, but updating the kerneldoc for
> kvm_pgtable_stage2_idmap_greedy() to state in which cases the range is
> updated and how would be helpful. It just says "negative error code on
> failure" at the moment.

Alternatively I could expose the 'reduce' path (maybe with another name
e.g. stage2_find_compatible_range() or so) and remove the
kvm_pgtable_stage2_idmap_greedy() wrapper. So it'd be the caller's
responsibility to not release the lock in between
stage2_find_compatible_range() and kvm_pgtable_stage2_map() for
instance, but that sounds reasonable to me. And that would make it
explicit it's the _map() path that failed with -ENOMEM, and that the
range can be re-used the second time.
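
Roughly something like this (untested sketch; names and exact signatures TBD,
and assuming _map() keeps taking the pool as its memcache argument, as
_idmap_greedy() does internally today):

	hyp_spin_lock(&host_kvm.lock);
	stage2_find_compatible_range(&host_kvm.pgt, addr, &range);
	ret = kvm_pgtable_stage2_map(&host_kvm.pgt, range.start,
				     range.end - range.start, range.start,
				     prot, pool);
	if (ret == -ENOMEM && !is_memory) {
		host_stage2_unmap_dev_all();
		/* The range is still valid, the lock hasn't been dropped */
		ret = kvm_pgtable_stage2_map(&host_kvm.pgt, range.start,
					     range.end - range.start,
					     range.start, prot, pool);
	}
	hyp_spin_unlock(&host_kvm.lock);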

Thoughts?

Thanks,
Quentin
Will Deacon March 8, 2021, 1:52 p.m. UTC | #5
On Mon, Mar 08, 2021 at 01:38:07PM +0000, Quentin Perret wrote:
> On Monday 08 Mar 2021 at 12:46:07 (+0000), Will Deacon wrote:
> > > > > +static int host_stage2_idmap(u64 addr)
> > > > > +{
> > > > > +	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
> > > > > +	struct kvm_mem_range range;
> > > > > +	bool is_memory = find_mem_range(addr, &range);
> > > > > +	struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
> > > > > +	int ret;
> > > > > +
> > > > > +	if (is_memory)
> > > > > +		prot |= KVM_PGTABLE_PROT_X;
> > > > > +
> > > > > +	hyp_spin_lock(&host_kvm.lock);
> > > > > +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> > > > > +					      &range, pool);
> > > > > +	if (is_memory || ret != -ENOMEM)
> > > > > +		goto unlock;
> > > > > +	host_stage2_unmap_dev_all();
> > > > > +	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
> > > > > +					      &range, pool);
> > > > 
> > > > I find this _really_ hard to reason about, as range is passed by reference
> > > > and we don't reset it after the first call returns -ENOMEM for an MMIO
> > > > mapping. Maybe some commentary on why it's still valid here?
> > > 
> > > Sure, I'll add something. FWIW, that is intended -- -ENOMEM can only be
> > > caused by the call to kvm_pgtable_stage2_map() which leaves the range
> > > untouched. So, as long as we don't release the lock, the range returned
> > > by the first call to kvm_pgtable_stage2_idmap_greedy() should still be
> > > valid. I suppose I could call kvm_pgtable_stage2_map() directly the
> > > second time to make it obvious but I thought this would expose the
> > > internals of kvm_pgtable_stage2_idmap_greedy() a little bit too much.
> > 
> > I can see it both ways, but updating the kerneldoc for
> > kvm_pgtable_stage2_idmap_greedy() to state in which cases the range is
> > updated and how would be helpful. It just says "negative error code on
> > failure" at the moment.
> 
> Alternatively I could expose the 'reduce' path (maybe with another name
> e.g. stage2_find_compatible_range() or so) and remove the
> kvm_pgtable_stage2_idmap_greedy() wrapper. So it'd be the caller's
> responsibility to not release the lock in between
> stage2_find_compatible_range() and kvm_pgtable_stage2_map() for
> instance, but that sounds reasonable to me. And that would make it
> explicit it's the _map() path that failed with -ENOMEM, and that the
> range can be re-used the second time.
> 
> Thoughts?

I suppose it depends on whether or not you reckon this could be optimised
into a single-pass of the page-table. If not, then splitting it up makes
sense to me (and actually, it's not like this has tonnes of callers so
even if we changed things in future it wouldn't be too hard to fix them up).

Will

Patch

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 6dce860f8bca..b127af02bd45 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -61,6 +61,7 @@ 
 #define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings		16
 #define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping	17
 #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector		18
+#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize		19
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/arm64/include/asm/kvm_cpufeature.h b/arch/arm64/include/asm/kvm_cpufeature.h
index d34f85cba358..74043a149322 100644
--- a/arch/arm64/include/asm/kvm_cpufeature.h
+++ b/arch/arm64/include/asm/kvm_cpufeature.h
@@ -15,3 +15,5 @@ 
 #endif
 
 KVM_HYP_CPU_FTR_REG(SYS_CTR_EL0, arm64_ftr_reg_ctrel0)
+KVM_HYP_CPU_FTR_REG(SYS_ID_AA64MMFR0_EL1, arm64_ftr_reg_id_aa64mmfr0_el1)
+KVM_HYP_CPU_FTR_REG(SYS_ID_AA64MMFR1_EL1, arm64_ftr_reg_id_aa64mmfr1_el1)
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 940c378fa837..d5dc2b792651 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -131,6 +131,9 @@  KVM_NVHE_ALIAS(__hyp_bss_end);
 KVM_NVHE_ALIAS(__hyp_rodata_start);
 KVM_NVHE_ALIAS(__hyp_rodata_end);
 
+/* pKVM static key */
+KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
+
 #endif /* CONFIG_KVM */
 
 #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b6a818f88051..a31c56bc55b3 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1889,12 +1889,22 @@  static int init_hyp_mode(void)
 	return err;
 }
 
+void _kvm_host_prot_finalize(void *discard)
+{
+	WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize));
+}
+
 static int finalize_hyp_mode(void)
 {
 	if (!is_protected_kvm_enabled())
 		return 0;
 
+	/*
+	 * Flip the static key upfront as that may no longer be possible
+	 * once the host stage 2 is installed.
+	 */
 	static_branch_enable(&kvm_protected_mode_initialized);
+	on_each_cpu(_kvm_host_prot_finalize, NULL, 1);
 
 	return 0;
 }
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
new file mode 100644
index 000000000000..d293cb328cc4
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -0,0 +1,34 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#ifndef __KVM_NVHE_MEM_PROTECT__
+#define __KVM_NVHE_MEM_PROTECT__
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/virt.h>
+#include <nvhe/spinlock.h>
+
+struct host_kvm {
+	struct kvm_arch arch;
+	struct kvm_pgtable pgt;
+	struct kvm_pgtable_mm_ops mm_ops;
+	hyp_spinlock_t lock;
+};
+extern struct host_kvm host_kvm;
+
+int __pkvm_prot_finalize(void);
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
+
+static __always_inline void __load_host_stage2(void)
+{
+	if (static_branch_likely(&kvm_protected_mode_initialized))
+		__load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+	else
+		write_sysreg(0, vttbr_el2);
+}
+#endif /* __KVM_NVHE_MEM_PROTECT__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index e204ea77ab27..ce49795324a7 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -14,7 +14,7 @@  lib-objs := $(addprefix ../../../lib/, $(lib-objs))
 
 obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
 	 hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \
-	 cache.o cpufeature.o setup.o mm.o
+	 cache.o cpufeature.o setup.o mm.o mem_protect.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
 	 ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
 obj-y += $(lib-objs)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index f312672d895e..6fa01b04954f 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -119,6 +119,7 @@  alternative_else_nop_endif
 
 	/* Invalidate the stale TLBs from Bootloader */
 	tlbi	alle2
+	tlbi	vmalls12e1
 	dsb	sy
 
 	/*
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index ae6503c9be15..f47028d3fd0a 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -13,6 +13,7 @@ 
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
+#include <nvhe/mem_protect.h>
 #include <nvhe/mm.h>
 #include <nvhe/trap_handler.h>
 
@@ -151,6 +152,10 @@  static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ct
 	cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot);
 }
 
+static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
+{
+	cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
+}
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -174,6 +179,7 @@  static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__pkvm_cpu_set_vector),
 	HANDLE_FUNC(__pkvm_create_mappings),
 	HANDLE_FUNC(__pkvm_create_private_mapping),
+	HANDLE_FUNC(__pkvm_prot_finalize),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
@@ -226,6 +232,11 @@  void handle_trap(struct kvm_cpu_context *host_ctxt)
 	case ESR_ELx_EC_SMC64:
 		handle_host_smc(host_ctxt);
 		break;
+	case ESR_ELx_EC_IABT_LOW:
+		fallthrough;
+	case ESR_ELx_EC_DABT_LOW:
+		handle_host_mem_abort(host_ctxt);
+		break;
 	default:
 		hyp_panic();
 	}
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
new file mode 100644
index 000000000000..2252ad1a8945
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -0,0 +1,213 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_cpufeature.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/stage2_pgtable.h>
+
+#include <hyp/switch.h>
+
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+
+extern unsigned long hyp_nr_cpus;
+struct host_kvm host_kvm;
+
+struct hyp_pool host_s2_mem;
+struct hyp_pool host_s2_dev;
+
+static void *host_s2_zalloc_pages_exact(size_t size)
+{
+	return hyp_alloc_pages(&host_s2_mem, get_order(size));
+}
+
+static void *host_s2_zalloc_page(void *pool)
+{
+	return hyp_alloc_pages(pool, 0);
+}
+
+static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+	unsigned long nr_pages, pfn;
+	int ret;
+
+	pfn = hyp_virt_to_pfn(mem_pgt_pool);
+	nr_pages = host_s2_mem_pgtable_pages();
+	ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
+	if (ret)
+		return ret;
+
+	pfn = hyp_virt_to_pfn(dev_pgt_pool);
+	nr_pages = host_s2_dev_pgtable_pages();
+	ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
+	if (ret)
+		return ret;
+
+	host_kvm.mm_ops.zalloc_pages_exact = host_s2_zalloc_pages_exact;
+	host_kvm.mm_ops.zalloc_page = host_s2_zalloc_page;
+	host_kvm.mm_ops.phys_to_virt = hyp_phys_to_virt;
+	host_kvm.mm_ops.virt_to_phys = hyp_virt_to_phys;
+	host_kvm.mm_ops.page_count = hyp_page_count;
+	host_kvm.mm_ops.get_page = hyp_get_page;
+	host_kvm.mm_ops.put_page = hyp_put_page;
+
+	return 0;
+}
+
+static void prepare_host_vtcr(void)
+{
+	u32 parange, phys_shift;
+	u64 mmfr0, mmfr1;
+
+	mmfr0 = arm64_ftr_reg_id_aa64mmfr0_el1.sys_val;
+	mmfr1 = arm64_ftr_reg_id_aa64mmfr1_el1.sys_val;
+
+	/* The host stage 2 is id-mapped, so use parange for T0SZ */
+	parange = kvm_get_parange(mmfr0);
+	phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+	host_kvm.arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+}
+
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+	int ret;
+
+	prepare_host_vtcr();
+	hyp_spin_lock_init(&host_kvm.lock);
+
+	ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
+	if (ret)
+		return ret;
+
+	ret = kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch,
+				      &host_kvm.mm_ops);
+	if (ret)
+		return ret;
+
+	mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
+	mmu->arch = &host_kvm.arch;
+	mmu->pgt = &host_kvm.pgt;
+	mmu->vmid.vmid_gen = 0;
+	mmu->vmid.vmid = 0;
+
+	return 0;
+}
+
+int __pkvm_prot_finalize(void)
+{
+	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+	struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+
+	params->vttbr = kvm_get_vttbr(mmu);
+	params->vtcr = host_kvm.arch.vtcr;
+	params->hcr_el2 |= HCR_VM;
+	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+		params->hcr_el2 |= HCR_FWB;
+	kvm_flush_dcache_to_poc(params, sizeof(*params));
+
+	write_sysreg(params->hcr_el2, hcr_el2);
+	__load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+
+	__tlbi(vmalls12e1is);
+	dsb(ish);
+	isb();
+
+	return 0;
+}
+
+static void host_stage2_unmap_dev_all(void)
+{
+	struct kvm_pgtable *pgt = &host_kvm.pgt;
+	struct memblock_region *reg;
+	u64 addr = 0;
+	int i;
+
+	/* Unmap all non-memory regions to recycle the pages */
+	for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
+		reg = &hyp_memory[i];
+		kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
+	}
+	kvm_pgtable_stage2_unmap(pgt, addr, ULONG_MAX);
+}
+
+static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
+{
+	int cur, left = 0, right = hyp_memblock_nr;
+	struct memblock_region *reg;
+	phys_addr_t end;
+
+	range->start = 0;
+	range->end = ULONG_MAX;
+
+	/* The list of memblock regions is sorted, binary search it */
+	while (left < right) {
+		cur = (left + right) >> 1;
+		reg = &hyp_memory[cur];
+		end = reg->base + reg->size;
+		if (addr < reg->base) {
+			right = cur;
+			range->end = reg->base;
+		} else if (addr >= end) {
+			left = cur + 1;
+			range->start = end;
+		} else {
+			range->start = reg->base;
+			range->end = end;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static int host_stage2_idmap(u64 addr)
+{
+	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
+	struct kvm_mem_range range;
+	bool is_memory = find_mem_range(addr, &range);
+	struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
+	int ret;
+
+	if (is_memory)
+		prot |= KVM_PGTABLE_PROT_X;
+
+	hyp_spin_lock(&host_kvm.lock);
+	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
+					      &range, pool);
+	if (is_memory || ret != -ENOMEM)
+		goto unlock;
+	host_stage2_unmap_dev_all();
+	ret = kvm_pgtable_stage2_idmap_greedy(&host_kvm.pgt, addr, prot,
+					      &range, pool);
+unlock:
+	hyp_spin_unlock(&host_kvm.lock);
+
+	return ret;
+}
+
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
+{
+	struct kvm_vcpu_fault_info fault;
+	u64 esr, addr;
+	int ret = 0;
+
+	esr = read_sysreg_el2(SYS_ESR);
+	if (!__get_fault_info(esr, &fault))
+		hyp_panic();
+
+	addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
+	ret = host_stage2_idmap(addr);
+	if (ret && ret != -EAGAIN)
+		hyp_panic();
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 7e923b25271c..94b9f14491f9 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -12,6 +12,7 @@ 
 #include <nvhe/early_alloc.h>
 #include <nvhe/gfp.h>
 #include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
 #include <nvhe/mm.h>
 #include <nvhe/trap_handler.h>
 
@@ -157,6 +158,10 @@  void __noreturn __pkvm_init_finalise(void)
 	if (ret)
 		goto out;
 
+	ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base);
+	if (ret)
+		goto out;
+
 	pkvm_pgtable_mm_ops.zalloc_page = hyp_zalloc_hyp_page;
 	pkvm_pgtable_mm_ops.phys_to_virt = hyp_phys_to_virt;
 	pkvm_pgtable_mm_ops.virt_to_phys = hyp_virt_to_phys;
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 979a76cdf9fb..31bc1a843bf8 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -28,6 +28,8 @@ 
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
+#include <nvhe/mem_protect.h>
+
 /* Non-VHE specific context */
 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
 DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -102,11 +104,6 @@  static void __deactivate_traps(struct kvm_vcpu *vcpu)
 	write_sysreg(__kvm_hyp_host_vector, vbar_el2);
 }
 
-static void __load_host_stage2(void)
-{
-	write_sysreg(0, vttbr_el2);
-}
-
 /* Save VGICv3 state on non-VHE systems */
 static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index fbde89a2c6e8..255a23a1b2db 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -8,6 +8,8 @@ 
 #include <asm/kvm_mmu.h>
 #include <asm/tlbflush.h>
 
+#include <nvhe/mem_protect.h>
+
 struct tlb_inv_context {
 	u64		tcr;
 };
@@ -43,7 +45,7 @@  static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
 
 static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
 {
-	write_sysreg(0, vttbr_el2);
+	__load_host_stage2();
 
 	if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
 		/* Ensure write of the host VMID */