
[v4,14/25] KVM: arm64: Add per-cpu fixmap infrastructure at EL2

Message ID 20221017115209.2099-15-will@kernel.org (mailing list archive)
State New, archived
Series KVM: arm64: Introduce pKVM hyp VM and vCPU state at EL2

Commit Message

Will Deacon Oct. 17, 2022, 11:51 a.m. UTC
From: Quentin Perret <qperret@google.com>

Mapping pages in a guest page-table from within the pKVM hypervisor at
EL2 may require cache maintenance to ensure that the initialised page
contents are visible even to non-cacheable (e.g. MMU-off) accesses from
the guest.

In preparation for performing this maintenance at EL2, introduce a
per-CPU fixmap which allows the pKVM hypervisor to map guest pages
temporarily into its stage-1 page-table for the purposes of cache
maintenance and, in future, poisoning on the reclaim path. The use of a
fixmap avoids the need for memory allocation or locking on the map()
path.

Tested-by: Vincent Donnefort <vdonnefort@google.com>
Signed-off-by: Quentin Perret <qperret@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kvm_pgtable.h          | 14 +++
 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |  2 +
 arch/arm64/kvm/hyp/include/nvhe/mm.h          |  4 +
 arch/arm64/kvm/hyp/nvhe/mem_protect.c         |  1 -
 arch/arm64/kvm/hyp/nvhe/mm.c                  | 94 +++++++++++++++++++
 arch/arm64/kvm/hyp/nvhe/setup.c               |  4 +
 arch/arm64/kvm/hyp/pgtable.c                  | 12 ---
 7 files changed, 118 insertions(+), 13 deletions(-)
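
For context, the intended usage pattern at EL2 is roughly the sketch below. This is illustrative only: the real callers land later in the series, the helper name hyp_clean_guest_page() is made up, and kvm_flush_dcache_to_poc() is assumed here as the cache-maintenance primitive.

/*
 * Illustrative sketch: map a guest page through the current CPU's
 * fixmap slot, clean it to the point of coherency so that non-cacheable
 * (e.g. MMU-off) guest accesses observe the initialised contents, then
 * tear the temporary mapping down again. No allocation or locking is
 * required on this path.
 */
static void hyp_clean_guest_page(phys_addr_t phys)
{
	void *va = hyp_fixmap_map(phys);

	kvm_flush_dcache_to_poc(va, PAGE_SIZE);
	hyp_fixmap_unmap();
}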

Comments

Mark Rutland Oct. 18, 2022, 11:06 a.m. UTC | #1
On Mon, Oct 17, 2022 at 12:51:58PM +0100, Will Deacon wrote:
> From: Quentin Perret <qperret@google.com>
> 
> Mapping pages in a guest page-table from within the pKVM hypervisor at
> EL2 may require cache maintenance to ensure that the initialised page
> contents are visible even to non-cacheable (e.g. MMU-off) accesses from
> the guest.
> 
> In preparation for performing this maintenance at EL2, introduce a
> per-CPU fixmap which allows the pKVM hypervisor to map guest pages
> temporarily into its stage-1 page-table for the purposes of cache
> maintenance and, in future, poisoning on the reclaim path. The use of a
> fixmap avoids the need for memory allocation or locking on the map()
> path.
> 
> Tested-by: Vincent Donnefort <vdonnefort@google.com>
> Signed-off-by: Quentin Perret <qperret@google.com>
> Signed-off-by: Will Deacon <will@kernel.org>
> ---
>  arch/arm64/include/asm/kvm_pgtable.h          | 14 +++
>  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |  2 +
>  arch/arm64/kvm/hyp/include/nvhe/mm.h          |  4 +
>  arch/arm64/kvm/hyp/nvhe/mem_protect.c         |  1 -
>  arch/arm64/kvm/hyp/nvhe/mm.c                  | 94 +++++++++++++++++++
>  arch/arm64/kvm/hyp/nvhe/setup.c               |  4 +
>  arch/arm64/kvm/hyp/pgtable.c                  | 12 ---
>  7 files changed, 118 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 4f6d79fe4352..b2a886c9e78d 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -30,6 +30,8 @@ typedef u64 kvm_pte_t;
>  #define KVM_PTE_ADDR_MASK		GENMASK(47, PAGE_SHIFT)
>  #define KVM_PTE_ADDR_51_48		GENMASK(15, 12)
>  
> +#define KVM_PHYS_INVALID		(-1ULL)
> +
>  static inline bool kvm_pte_valid(kvm_pte_t pte)
>  {
>  	return pte & KVM_PTE_VALID;
> @@ -45,6 +47,18 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
>  	return pa;
>  }
>  
> +static inline kvm_pte_t kvm_phys_to_pte(u64 pa)
> +{
> +	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
> +
> +	if (PAGE_SHIFT == 16) {
> +		pa &= GENMASK(51, 48);
> +		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
> +	}
> +
> +	return pte;
> +}
> +
>  static inline u64 kvm_granule_shift(u32 level)
>  {
>  	/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
> diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> index ce9a796a85ee..ef31a1872c93 100644
> --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> @@ -59,6 +59,8 @@ enum pkvm_component_id {
>  	PKVM_ID_HYP,
>  };
>  
> +extern unsigned long hyp_nr_cpus;
> +
>  int __pkvm_prot_finalize(void);
>  int __pkvm_host_share_hyp(u64 pfn);
>  int __pkvm_host_unshare_hyp(u64 pfn);
> diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
> index b2ee6d5df55b..d5ec972b5c1e 100644
> --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
> +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
> @@ -13,6 +13,10 @@
>  extern struct kvm_pgtable pkvm_pgtable;
>  extern hyp_spinlock_t pkvm_pgd_lock;
>  
> +int hyp_create_pcpu_fixmap(void);
> +void *hyp_fixmap_map(phys_addr_t phys);
> +void hyp_fixmap_unmap(void);
> +
>  int hyp_create_idmap(u32 hyp_va_bits);
>  int hyp_map_vectors(void);
>  int hyp_back_vmemmap(phys_addr_t back);
> diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> index 2ef6aaa21ba5..1c38451050e5 100644
> --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> @@ -21,7 +21,6 @@
>  
>  #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
>  
> -extern unsigned long hyp_nr_cpus;
>  struct host_mmu host_mmu;
>  
>  static struct hyp_pool host_s2_pool;
> diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
> index d3a3b47181de..b77215630d5c 100644
> --- a/arch/arm64/kvm/hyp/nvhe/mm.c
> +++ b/arch/arm64/kvm/hyp/nvhe/mm.c
> @@ -14,6 +14,7 @@
>  #include <nvhe/early_alloc.h>
>  #include <nvhe/gfp.h>
>  #include <nvhe/memory.h>
> +#include <nvhe/mem_protect.h>
>  #include <nvhe/mm.h>
>  #include <nvhe/spinlock.h>
>  
> @@ -25,6 +26,12 @@ unsigned int hyp_memblock_nr;
>  
>  static u64 __io_map_base;
>  
> +struct hyp_fixmap_slot {
> +	u64 addr;
> +	kvm_pte_t *ptep;
> +};
> +static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots);
> +
>  static int __pkvm_create_mappings(unsigned long start, unsigned long size,
>  				  unsigned long phys, enum kvm_pgtable_prot prot)
>  {
> @@ -212,6 +219,93 @@ int hyp_map_vectors(void)
>  	return 0;
>  }
>  
> +void *hyp_fixmap_map(phys_addr_t phys)
> +{
> +	struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
> +	kvm_pte_t pte, *ptep = slot->ptep;
> +
> +	pte = *ptep;
> +	pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID);
> +	pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID;
> +	WRITE_ONCE(*ptep, pte);
> +	dsb(nshst);
> +
> +	return (void *)slot->addr;
> +}
> +
> +static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
> +{
> +	kvm_pte_t *ptep = slot->ptep;
> +	u64 addr = slot->addr;
> +
> +	WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
> +	dsb(nshst);
> +	__tlbi_level(vale2, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1));
> +	dsb(nsh);
> +	isb();
> +}

Does each CPU have independent Stage-1 tables at EL2? i.e. each has a distinct
root table?

If the tables are shared, you need broadcast maintenance and ISH barriers here,
or you risk the usual issues with asynchronous MMU behaviour.

If those are per-cpu, sorry for the noise!

Thanks,
Mark.

> +
> +void hyp_fixmap_unmap(void)
> +{
> +	fixmap_clear_slot(this_cpu_ptr(&fixmap_slots));
> +}
> +
> +static int __create_fixmap_slot_cb(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> +				   enum kvm_pgtable_walk_flags flag,
> +				   void * const arg)
> +{
> +	struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)arg);
> +
> +	if (!kvm_pte_valid(*ptep) || level != KVM_PGTABLE_MAX_LEVELS - 1)
> +		return -EINVAL;
> +
> +	slot->addr = addr;
> +	slot->ptep = ptep;
> +
> +	/*
> +	 * Clear the PTE, but keep the page-table page refcount elevated to
> +	 * prevent it from ever being freed. This lets us manipulate the PTEs
> +	 * by hand safely without ever needing to allocate memory.
> +	 */
> +	fixmap_clear_slot(slot);
> +
> +	return 0;
> +}
> +
> +static int create_fixmap_slot(u64 addr, u64 cpu)
> +{
> +	struct kvm_pgtable_walker walker = {
> +		.cb	= __create_fixmap_slot_cb,
> +		.flags	= KVM_PGTABLE_WALK_LEAF,
> +		.arg = (void *)cpu,
> +	};
> +
> +	return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
> +}
> +
> +int hyp_create_pcpu_fixmap(void)
> +{
> +	unsigned long addr, i;
> +	int ret;
> +
> +	for (i = 0; i < hyp_nr_cpus; i++) {
> +		ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr);
> +		if (ret)
> +			return ret;
> +
> +		ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE,
> +					  __hyp_pa(__hyp_bss_start), PAGE_HYP);
> +		if (ret)
> +			return ret;
> +
> +		ret = create_fixmap_slot(addr, i);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
>  int hyp_create_idmap(u32 hyp_va_bits)
>  {
>  	unsigned long start, end;
> diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
> index 2be72fbe7279..0f69c1393416 100644
> --- a/arch/arm64/kvm/hyp/nvhe/setup.c
> +++ b/arch/arm64/kvm/hyp/nvhe/setup.c
> @@ -321,6 +321,10 @@ void __noreturn __pkvm_init_finalise(void)
>  	if (ret)
>  		goto out;
>  
> +	ret = hyp_create_pcpu_fixmap();
> +	if (ret)
> +		goto out;
> +
>  	pkvm_hyp_vm_table_init(vm_table_base);
>  out:
>  	/*
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index a1a27f88a312..2bcb2d5903ba 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -57,8 +57,6 @@ struct kvm_pgtable_walk_data {
>  	u64				end;
>  };
>  
> -#define KVM_PHYS_INVALID (-1ULL)
> -
>  static bool kvm_phys_is_valid(u64 phys)
>  {
>  	return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
> @@ -122,16 +120,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level)
>  	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
>  }
>  
> -static kvm_pte_t kvm_phys_to_pte(u64 pa)
> -{
> -	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
> -
> -	if (PAGE_SHIFT == 16)
> -		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
> -
> -	return pte;
> -}
> -
>  static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
>  {
>  	return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
> -- 
> 2.38.0.413.g74048e4d9e-goog
>
Will Deacon Oct. 18, 2022, 2:05 p.m. UTC | #2
Hi Mark,

Cheers for having a look.

On Tue, Oct 18, 2022 at 12:06:14PM +0100, Mark Rutland wrote:
> On Mon, Oct 17, 2022 at 12:51:58PM +0100, Will Deacon wrote:
> > diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
> > index d3a3b47181de..b77215630d5c 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/mm.c
> > +++ b/arch/arm64/kvm/hyp/nvhe/mm.c
> > @@ -14,6 +14,7 @@
> >  #include <nvhe/early_alloc.h>
> >  #include <nvhe/gfp.h>
> >  #include <nvhe/memory.h>
> > +#include <nvhe/mem_protect.h>
> >  #include <nvhe/mm.h>
> >  #include <nvhe/spinlock.h>
> >  
> > @@ -25,6 +26,12 @@ unsigned int hyp_memblock_nr;
> >  
> >  static u64 __io_map_base;
> >  
> > +struct hyp_fixmap_slot {
> > +	u64 addr;
> > +	kvm_pte_t *ptep;
> > +};
> > +static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots);
> > +
> >  static int __pkvm_create_mappings(unsigned long start, unsigned long size,
> >  				  unsigned long phys, enum kvm_pgtable_prot prot)
> >  {
> > @@ -212,6 +219,93 @@ int hyp_map_vectors(void)
> >  	return 0;
> >  }
> >  
> > +void *hyp_fixmap_map(phys_addr_t phys)
> > +{
> > +	struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
> > +	kvm_pte_t pte, *ptep = slot->ptep;
> > +
> > +	pte = *ptep;
> > +	pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID);
> > +	pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID;
> > +	WRITE_ONCE(*ptep, pte);
> > +	dsb(nshst);
> > +
> > +	return (void *)slot->addr;
> > +}
> > +
> > +static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
> > +{
> > +	kvm_pte_t *ptep = slot->ptep;
> > +	u64 addr = slot->addr;
> > +
> > +	WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
> > +	dsb(nshst);
> > +	__tlbi_level(vale2, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1));
> > +	dsb(nsh);
> > +	isb();
> > +}
> 
> Does each CPU have independent Stage-1 tables at EL2? i.e. each has a distinct
> root table?

No, the CPUs share the same stage-1 table at EL2.

> If the tables are shared, you need broadcast maintenance and ISH barriers here,
> or you risk the usual issues with asynchronous MMU behaviour.

Can you elaborate a bit, please? What we're trying to do is reserve a page
of VA space for each CPU, which is only ever accessed explicitly by that
CPU using a normal memory mapping. The fixmap code therefore just updates
the relevant leaf entry for the CPU on which we're running and the TLBI
is there to ensure that the new mapping takes effect.

If another CPU speculatively walks another CPU's fixmap slot, then I agree
that it could access that page after the slot had been cleared. Although
I can see theoretical security arguments around avoiding that situation,
there's a very real performance cost to broadcast invalidation that we
were hoping to avoid on this fast path.

Of course, in the likely event that I've purged "the usual issues" from
my head and we need broadcasting for _correctness_, then we'll just have
to suck it up!

Cheers,

Will
Mark Rutland Oct. 18, 2022, 4:52 p.m. UTC | #3
On Tue, Oct 18, 2022 at 03:05:14PM +0100, Will Deacon wrote:
> Hi Mark,
> 
> Cheers for having a look.
> 
> On Tue, Oct 18, 2022 at 12:06:14PM +0100, Mark Rutland wrote:
> > On Mon, Oct 17, 2022 at 12:51:58PM +0100, Will Deacon wrote:
> > > diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
> > > index d3a3b47181de..b77215630d5c 100644
> > > --- a/arch/arm64/kvm/hyp/nvhe/mm.c
> > > +++ b/arch/arm64/kvm/hyp/nvhe/mm.c
> > > @@ -14,6 +14,7 @@
> > >  #include <nvhe/early_alloc.h>
> > >  #include <nvhe/gfp.h>
> > >  #include <nvhe/memory.h>
> > > +#include <nvhe/mem_protect.h>
> > >  #include <nvhe/mm.h>
> > >  #include <nvhe/spinlock.h>
> > >  
> > > @@ -25,6 +26,12 @@ unsigned int hyp_memblock_nr;
> > >  
> > >  static u64 __io_map_base;
> > >  
> > > +struct hyp_fixmap_slot {
> > > +	u64 addr;
> > > +	kvm_pte_t *ptep;
> > > +};
> > > +static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots);
> > > +
> > >  static int __pkvm_create_mappings(unsigned long start, unsigned long size,
> > >  				  unsigned long phys, enum kvm_pgtable_prot prot)
> > >  {
> > > @@ -212,6 +219,93 @@ int hyp_map_vectors(void)
> > >  	return 0;
> > >  }
> > >  
> > > +void *hyp_fixmap_map(phys_addr_t phys)
> > > +{
> > > +	struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
> > > +	kvm_pte_t pte, *ptep = slot->ptep;
> > > +
> > > +	pte = *ptep;
> > > +	pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID);
> > > +	pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID;
> > > +	WRITE_ONCE(*ptep, pte);
> > > +	dsb(nshst);
> > > +
> > > +	return (void *)slot->addr;
> > > +}
> > > +
> > > +static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
> > > +{
> > > +	kvm_pte_t *ptep = slot->ptep;
> > > +	u64 addr = slot->addr;
> > > +
> > > +	WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
> > > +	dsb(nshst);
> > > +	__tlbi_level(vale2, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1));
> > > +	dsb(nsh);
> > > +	isb();
> > > +}
> > 
> > Does each CPU have independent Stage-1 tables at EL2? i.e. each has a distinct
> > root table?
> 
> No, the CPUs share the same stage-1 table at EL2.

Ah, then I think there's a problem here.

> > If the tables are shared, you need broadcast maintenance and ISH barriers here,
> > or you risk the usual issues with asynchronous MMU behaviour.
> 
> Can you elaborate a bit, please? What we're trying to do is reserve a page
> of VA space for each CPU, which is only ever accessed explicitly by that
> CPU using a normal memory mapping. The fixmap code therefore just updates
> the relevant leaf entry for the CPU on which we're running and the TLBI
> is there to ensure that the new mapping takes effect.
> 
> If another CPU speculatively walks another CPU's fixmap slot, then I agree
> that it could access that page after the slot had been cleared. Although
> I can see theoretical security arguments around avoiding that situation,
> there's a very real performance cost to broadcast invalidation that we
> were hoping to avoid on this fast path.

The issue is that any CPU could walk any of these entries at any time
for any reason, and without broadcast maintenance we'd be violating the
Break-Before-Make requirements. That permits a number of things,
including "amalgamation", which would permit the CPU to consume some
arbitrary function of the old+new entries. Among other things, that can
permit accesses to entirely bogus physical addresses that weren't in
either entry (e.g. making speculative accesses to arbitrary device
addresses).

For correctness, you need the maintenance to be broadcast to all PEs
which could observe the old and new entries.

> Of course, in the likely event that I've purged "the usual issues" from
> my head and we need broadcasting for _correctness_, then we'll just have
> to suck it up!

As above, I believe you need this for correctness.

I'm not sure if FEAT_BBM level 2 gives you the necessary properties to
relax this on some HW.

Thanks,
Mark.
Will Deacon Oct. 19, 2022, 12:01 p.m. UTC | #4
On Tue, Oct 18, 2022 at 05:52:18PM +0100, Mark Rutland wrote:
> On Tue, Oct 18, 2022 at 03:05:14PM +0100, Will Deacon wrote:
> > On Tue, Oct 18, 2022 at 12:06:14PM +0100, Mark Rutland wrote:
> > > If the tables are shared, you need broadcast maintenance and ISH barriers here,
> > > or you risk the usual issues with asynchronous MMU behaviour.
> > 
> > Can you elaborate a bit, please? What we're trying to do is reserve a page
> > of VA space for each CPU, which is only ever accessed explicitly by that
> > CPU using a normal memory mapping. The fixmap code therefore just updates
> > the relevant leaf entry for the CPU on which we're running and the TLBI
> > is there to ensure that the new mapping takes effect.
> > 
> > If another CPU speculatively walks another CPU's fixmap slot, then I agree
> > that it could access that page after the slot had been cleared. Although
> > I can see theoretical security arguments around avoiding that situation,
> > there's a very real performance cost to broadcast invalidation that we
> > were hoping to avoid on this fast path.
> 
> The issue is that any CPU could walk any of these entries at any time
> for any reason, and without broadcast maintenance we'd be violating the
> Break-Before-Make requirements. That permits a number of things,
> including "amalgamation", which would permit the CPU to consume some
> arbitrary function of the old+new entries. Among other things, that can
> permit accesses to entirely bogus physical addresses that weren't in
> either entry (e.g. making speculative accesses to arbitrary device
> addresses).
> 
> For correctness, you need the maintenance to be broadcast to all PEs
> which could observe the old and new entries.

Urgh, I had definitely purged that one. Thanks for the refresher.

> > Of course, in the likely event that I've purged "the usual issues" from
> > my head and we need broadcasting for _correctness_, then we'll just have
> > to suck it up!
> 
> As above, I believe you need this for correctness.
> 
> I'm not sure if FEAT_BBM level 2 gives you the necessary properties to
> relax this on some HW.

I'll seek clarification on this, as that amalgamation text feels a bit
over-reaching to me (particularly when combined with the fact that the
CPU still has to respect things like the NS bit) and I suspect that we
wouldn't see it in practice for this case.

But for now, I'll add the broadcasting so we don't block the series.

Will
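
For reference, the broadcast variant Will agrees to adopt would look something like the sketch below: inner-shareable barriers plus a broadcast TLB invalidation, so that every PE able to walk the shared EL2 stage-1 table observes the break before the slot is reused. This is a sketch of the agreed direction, not the actual follow-up patch.

static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
{
	kvm_pte_t *ptep = slot->ptep;
	u64 addr = slot->addr;

	WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);

	/*
	 * The EL2 stage-1 table is shared between CPUs, so the
	 * invalidation must be broadcast to avoid the break-before-make
	 * violation discussed above.
	 */
	dsb(ishst);
	__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_MAX_LEVELS - 1);
	dsb(ish);
	isb();
}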

Patch

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 4f6d79fe4352..b2a886c9e78d 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -30,6 +30,8 @@ typedef u64 kvm_pte_t;
 #define KVM_PTE_ADDR_MASK		GENMASK(47, PAGE_SHIFT)
 #define KVM_PTE_ADDR_51_48		GENMASK(15, 12)
 
+#define KVM_PHYS_INVALID		(-1ULL)
+
 static inline bool kvm_pte_valid(kvm_pte_t pte)
 {
 	return pte & KVM_PTE_VALID;
@@ -45,6 +47,18 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
 	return pa;
 }
 
+static inline kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+	if (PAGE_SHIFT == 16) {
+		pa &= GENMASK(51, 48);
+		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+	}
+
+	return pte;
+}
+
 static inline u64 kvm_granule_shift(u32 level)
 {
 	/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index ce9a796a85ee..ef31a1872c93 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -59,6 +59,8 @@ enum pkvm_component_id {
 	PKVM_ID_HYP,
 };
 
+extern unsigned long hyp_nr_cpus;
+
 int __pkvm_prot_finalize(void);
 int __pkvm_host_share_hyp(u64 pfn);
 int __pkvm_host_unshare_hyp(u64 pfn);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index b2ee6d5df55b..d5ec972b5c1e 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -13,6 +13,10 @@ 
 extern struct kvm_pgtable pkvm_pgtable;
 extern hyp_spinlock_t pkvm_pgd_lock;
 
+int hyp_create_pcpu_fixmap(void);
+void *hyp_fixmap_map(phys_addr_t phys);
+void hyp_fixmap_unmap(void);
+
 int hyp_create_idmap(u32 hyp_va_bits);
 int hyp_map_vectors(void);
 int hyp_back_vmemmap(phys_addr_t back);
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 2ef6aaa21ba5..1c38451050e5 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -21,7 +21,6 @@ 
 
 #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
 
-extern unsigned long hyp_nr_cpus;
 struct host_mmu host_mmu;
 
 static struct hyp_pool host_s2_pool;
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index d3a3b47181de..b77215630d5c 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -14,6 +14,7 @@ 
 #include <nvhe/early_alloc.h>
 #include <nvhe/gfp.h>
 #include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
 #include <nvhe/mm.h>
 #include <nvhe/spinlock.h>
 
@@ -25,6 +26,12 @@ unsigned int hyp_memblock_nr;
 
 static u64 __io_map_base;
 
+struct hyp_fixmap_slot {
+	u64 addr;
+	kvm_pte_t *ptep;
+};
+static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots);
+
 static int __pkvm_create_mappings(unsigned long start, unsigned long size,
 				  unsigned long phys, enum kvm_pgtable_prot prot)
 {
@@ -212,6 +219,93 @@ int hyp_map_vectors(void)
 	return 0;
 }
 
+void *hyp_fixmap_map(phys_addr_t phys)
+{
+	struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
+	kvm_pte_t pte, *ptep = slot->ptep;
+
+	pte = *ptep;
+	pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID);
+	pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID;
+	WRITE_ONCE(*ptep, pte);
+	dsb(nshst);
+
+	return (void *)slot->addr;
+}
+
+static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
+{
+	kvm_pte_t *ptep = slot->ptep;
+	u64 addr = slot->addr;
+
+	WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
+	dsb(nshst);
+	__tlbi_level(vale2, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1));
+	dsb(nsh);
+	isb();
+}
+
+void hyp_fixmap_unmap(void)
+{
+	fixmap_clear_slot(this_cpu_ptr(&fixmap_slots));
+}
+
+static int __create_fixmap_slot_cb(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+				   enum kvm_pgtable_walk_flags flag,
+				   void * const arg)
+{
+	struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)arg);
+
+	if (!kvm_pte_valid(*ptep) || level != KVM_PGTABLE_MAX_LEVELS - 1)
+		return -EINVAL;
+
+	slot->addr = addr;
+	slot->ptep = ptep;
+
+	/*
+	 * Clear the PTE, but keep the page-table page refcount elevated to
+	 * prevent it from ever being freed. This lets us manipulate the PTEs
+	 * by hand safely without ever needing to allocate memory.
+	 */
+	fixmap_clear_slot(slot);
+
+	return 0;
+}
+
+static int create_fixmap_slot(u64 addr, u64 cpu)
+{
+	struct kvm_pgtable_walker walker = {
+		.cb	= __create_fixmap_slot_cb,
+		.flags	= KVM_PGTABLE_WALK_LEAF,
+		.arg = (void *)cpu,
+	};
+
+	return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
+}
+
+int hyp_create_pcpu_fixmap(void)
+{
+	unsigned long addr, i;
+	int ret;
+
+	for (i = 0; i < hyp_nr_cpus; i++) {
+		ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr);
+		if (ret)
+			return ret;
+
+		ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE,
+					  __hyp_pa(__hyp_bss_start), PAGE_HYP);
+		if (ret)
+			return ret;
+
+		ret = create_fixmap_slot(addr, i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 int hyp_create_idmap(u32 hyp_va_bits)
 {
 	unsigned long start, end;
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 2be72fbe7279..0f69c1393416 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -321,6 +321,10 @@ void __noreturn __pkvm_init_finalise(void)
 	if (ret)
 		goto out;
 
+	ret = hyp_create_pcpu_fixmap();
+	if (ret)
+		goto out;
+
 	pkvm_hyp_vm_table_init(vm_table_base);
 out:
 	/*
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index a1a27f88a312..2bcb2d5903ba 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -57,8 +57,6 @@ struct kvm_pgtable_walk_data {
 	u64				end;
 };
 
-#define KVM_PHYS_INVALID (-1ULL)
-
 static bool kvm_phys_is_valid(u64 phys)
 {
 	return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
@@ -122,16 +120,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level)
 	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
 }
 
-static kvm_pte_t kvm_phys_to_pte(u64 pa)
-{
-	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
-
-	if (PAGE_SHIFT == 16)
-		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
-
-	return pte;
-}
-
 static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
 {
 	return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));