
[v3,10/11] kvm: arm64: Set up hyp percpu data for nVHE

Message ID: 20200916173439.32265-11-dbrazdil@google.com
Series: Independent per-CPU data section for nVHE

Commit Message

David Brazdil Sept. 16, 2020, 5:34 p.m. UTC
Add a hyp percpu section to the linker script and rename the corresponding
ELF sections of the hyp/nvhe object files. This moves all nVHE-specific
percpu variables into the new hyp percpu section.

Allocate a sufficient amount of memory for all percpu hyp regions at global
KVM init time and create the corresponding hyp mappings.

The base addresses of the hyp percpu regions are kept in a dynamically
allocated array in the kernel.

Add NULL checks in the PMU event-reset code as it may run before KVM memory
is initialized.

Signed-off-by: David Brazdil <dbrazdil@google.com>
---
 arch/arm64/include/asm/kvm_asm.h  | 19 +++++++++--
 arch/arm64/include/asm/sections.h |  1 +
 arch/arm64/kernel/vmlinux.lds.S   |  9 +++++
 arch/arm64/kvm/arm.c              | 56 +++++++++++++++++++++++++++++--
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S |  6 ++++
 arch/arm64/kvm/pmu.c              |  5 ++-
 6 files changed, 90 insertions(+), 6 deletions(-)
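
As a reading aid (not part of the patch), the lookup that the new accessor
performs boils down to the sketch below; kvm_host_data stands in for any
nVHE percpu symbol:

	/*
	 * Kernel-side lookup of CPU n's copy of a hyp percpu variable:
	 *
	 *   base = kvm_arm_hyp_percpu_base[n];          // allocated at KVM init
	 *   off  = &sym - &__kvm_nvhe___per_cpu_start;  // link-time offset
	 *   ptr  = base ? base + off : NULL;
	 *
	 * The accessor therefore yields NULL until init_hyp_mode() has run,
	 * so callers that can execute earlier must check the result:
	 */
	struct kvm_host_data *ctx = this_cpu_ptr_hyp(kvm_host_data);

	if (!ctx)
		return;	/* hyp percpu memory not allocated yet */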

Comments

Will Deacon Sept. 18, 2020, 12:22 p.m. UTC | #1
On Wed, Sep 16, 2020 at 06:34:38PM +0100, David Brazdil wrote:
> Add a hyp percpu section to the linker script and rename the corresponding
> ELF sections of the hyp/nvhe object files. This moves all nVHE-specific
> percpu variables into the new hyp percpu section.
> 
> Allocate a sufficient amount of memory for all percpu hyp regions at global
> KVM init time and create the corresponding hyp mappings.
> 
> The base addresses of the hyp percpu regions are kept in a dynamically
> allocated array in the kernel.
> 
> Add NULL checks in the PMU event-reset code as it may run before KVM memory
> is initialized.
> 
> Signed-off-by: David Brazdil <dbrazdil@google.com>
> ---
>  arch/arm64/include/asm/kvm_asm.h  | 19 +++++++++--
>  arch/arm64/include/asm/sections.h |  1 +
>  arch/arm64/kernel/vmlinux.lds.S   |  9 +++++
>  arch/arm64/kvm/arm.c              | 56 +++++++++++++++++++++++++++++--
>  arch/arm64/kvm/hyp/nvhe/hyp.lds.S |  6 ++++
>  arch/arm64/kvm/pmu.c              |  5 ++-
>  6 files changed, 90 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> index abc03f386b40..e0e1e404f6eb 100644
> --- a/arch/arm64/include/asm/kvm_asm.h
> +++ b/arch/arm64/include/asm/kvm_asm.h
> @@ -66,8 +66,23 @@
>  #define CHOOSE_VHE_SYM(sym)	sym
>  #define CHOOSE_NVHE_SYM(sym)	kvm_nvhe_sym(sym)
>  
> -#define this_cpu_ptr_nvhe(sym)		this_cpu_ptr(&kvm_nvhe_sym(sym))
> -#define per_cpu_ptr_nvhe(sym, cpu)	per_cpu_ptr(&kvm_nvhe_sym(sym), cpu)
> +/* Array of percpu base addresses. Length of the array is nr_cpu_ids. */
> +extern unsigned long *kvm_arm_hyp_percpu_base;
> +
> +/*
> + * Compute pointer to a symbol defined in nVHE percpu region.
> + * Returns NULL if percpu memory has not been allocated yet.
> + */
> +#define this_cpu_ptr_nvhe(sym)	per_cpu_ptr_nvhe(sym, smp_processor_id())

Don't you run into similar problems here with the pcpu accessors when
CONFIG_DEBUG_PREEMPT=y? I'm worried we can end up with an unresolved
reference to debug_smp_processor_id().
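
FWIW, a sketch of where that reference would come from (paraphrasing
include/linux/smp.h from memory, so the exact guards may differ):

	#ifdef CONFIG_DEBUG_PREEMPT
	  extern unsigned int debug_smp_processor_id(void);
	# define smp_processor_id() debug_smp_processor_id()
	#endif

	/*
	 * this_cpu_ptr_nvhe(sym)
	 *   -> per_cpu_ptr_nvhe(sym, smp_processor_id())
	 *   -> per_cpu_ptr_nvhe(sym, debug_smp_processor_id())
	 *
	 * i.e. a call to debug_smp_processor_id(), which is a kernel-proper
	 * function and not linked into the kvm_nvhe object.
	 */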

> +#define per_cpu_ptr_nvhe(sym, cpu)					\
> +	({								\
> +		unsigned long base, off;				\
> +		base = kvm_arm_hyp_percpu_base				\
> +			? kvm_arm_hyp_percpu_base[cpu] : 0;		\
> +		off = (unsigned long)&kvm_nvhe_sym(sym) -		\
> +		      (unsigned long)&kvm_nvhe_sym(__per_cpu_start);	\
> +		base ? (typeof(kvm_nvhe_sym(sym))*)(base + off) : NULL;	\
> +	})
>  
>  #ifndef __KVM_NVHE_HYPERVISOR__
>  /*
> diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h
> index 3994169985ef..5062553a6847 100644
> --- a/arch/arm64/include/asm/sections.h
> +++ b/arch/arm64/include/asm/sections.h
> @@ -18,5 +18,6 @@ extern char __exittext_begin[], __exittext_end[];
>  extern char __irqentry_text_start[], __irqentry_text_end[];
>  extern char __mmuoff_data_start[], __mmuoff_data_end[];
>  extern char __entry_tramp_text_start[], __entry_tramp_text_end[];
> +extern char __kvm_nvhe___per_cpu_start[], __kvm_nvhe___per_cpu_end[];
>  
>  #endif /* __ASM_SECTIONS_H */
> diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
> index 0d543570e811..d52e6b5dbfd3 100644
> --- a/arch/arm64/kernel/vmlinux.lds.S
> +++ b/arch/arm64/kernel/vmlinux.lds.S
> @@ -9,6 +9,7 @@
>  
>  #include <asm-generic/vmlinux.lds.h>
>  #include <asm/cache.h>
> +#include <asm/hyp_image.h>
>  #include <asm/kernel-pgtable.h>
>  #include <asm/memory.h>
>  #include <asm/page.h>
> @@ -27,8 +28,15 @@ jiffies = jiffies_64;
>  	__start___kvm_ex_table = .;				\
>  	*(__kvm_ex_table)					\
>  	__stop___kvm_ex_table = .;
> +
> +#define HYPERVISOR_PERCPU_SECTION				\
> +	. = ALIGN(PAGE_SIZE);					\
> +	HYP_SECTION_NAME(.data..percpu) : {			\
> +		*(HYP_SECTION_NAME(.data..percpu))		\
> +	}
>  #else /* CONFIG_KVM */
>  #define HYPERVISOR_EXTABLE
> +#define HYPERVISOR_PERCPU_SECTION
>  #endif
>  
>  #define HYPERVISOR_TEXT					\
> @@ -194,6 +202,7 @@ SECTIONS
>  	}
>  
>  	PERCPU_SECTION(L1_CACHE_BYTES)
> +	HYPERVISOR_PERCPU_SECTION
>  
>  	.rela.dyn : ALIGN(8) {
>  		*(.rela .rela*)
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 58d7d614519b..8293319a32e7 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -47,6 +47,7 @@ __asm__(".arch_extension	virt");
>  #endif
>  
>  static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
> +unsigned long *kvm_arm_hyp_percpu_base;
>  
>  /* The VMID used in the VTTBR */
>  static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
> @@ -1258,6 +1259,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  	}
>  }
>  
> +#define kvm_hyp_percpu_base(cpu)	((unsigned long)per_cpu_ptr_nvhe(__per_cpu_start, cpu))

Having both kvm_arm_hyp_percpu_base and kvm_hyp_percpu_base be so
completely different is a recipe for disaster! Please can you rename
one/both of them to make it clearer what they represent?

> diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
> index 6d80ffe1ebfc..cd653091f213 100644
> --- a/arch/arm64/kvm/pmu.c
> +++ b/arch/arm64/kvm/pmu.c
> @@ -33,7 +33,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
>  {
>  	struct kvm_host_data *ctx = this_cpu_ptr_hyp(kvm_host_data);
>  
> -	if (!kvm_pmu_switch_needed(attr))
> +	if (!ctx || !kvm_pmu_switch_needed(attr))
>  		return;
>  
>  	if (!attr->exclude_host)
> @@ -49,6 +49,9 @@ void kvm_clr_pmu_events(u32 clr)
>  {
>  	struct kvm_host_data *ctx = this_cpu_ptr_hyp(kvm_host_data);
>  
> +	if (!ctx)
> +		return;

I guess this should only happen if KVM failed to initialise or something,
right? (e.g. if we were booted at EL1). If so, I'm wondering whether it
would be better to do something like:

	if (!is_hyp_mode_available())
		return;

	WARN_ON_ONCE(!ctx);

so that any unexpected NULL pointer there screams loudly, rather than causes
the register switch to be silently ignored. What do you think?

Will
David Brazdil Sept. 22, 2020, 6:34 p.m. UTC | #2
> > -#define this_cpu_ptr_nvhe(sym)		this_cpu_ptr(&kvm_nvhe_sym(sym))
> > -#define per_cpu_ptr_nvhe(sym, cpu)	per_cpu_ptr(&kvm_nvhe_sym(sym), cpu)
> > +/* Array of percpu base addresses. Length of the array is nr_cpu_ids. */
> > +extern unsigned long *kvm_arm_hyp_percpu_base;
> > +
> > +/*
> > + * Compute pointer to a symbol defined in nVHE percpu region.
> > + * Returns NULL if percpu memory has not been allocated yet.
> > + */
> > +#define this_cpu_ptr_nvhe(sym)	per_cpu_ptr_nvhe(sym, smp_processor_id())
> 
> Don't you run into similar problems here with the pcpu accessors when
> CONFIG_DEBUG_PREEMPT=y? I'm worried we can end up with an unresolved
> reference to debug_smp_processor_id().

Fortunately not. This now doesn't use the generic macros at all.

> >  /* The VMID used in the VTTBR */
> >  static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
> > @@ -1258,6 +1259,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
> >  	}
> >  }
> >  
> > +#define kvm_hyp_percpu_base(cpu)	((unsigned long)per_cpu_ptr_nvhe(__per_cpu_start, cpu))
> 
> Having both kvm_arm_hyp_percpu_base and kvm_hyp_percpu_base be so
> completely different is a recipe for disaster! Please can you rename
> one/both of them to make it clearer what they represent?

I am heavily simplifying this code in v4. Got rid of this macro completely, so
hopefully there will be no confusion.

> > -	if (!kvm_pmu_switch_needed(attr))
> > +	if (!ctx || !kvm_pmu_switch_needed(attr))
> >  		return;
> >  
> >  	if (!attr->exclude_host)
> > @@ -49,6 +49,9 @@ void kvm_clr_pmu_events(u32 clr)
> >  {
> >  	struct kvm_host_data *ctx = this_cpu_ptr_hyp(kvm_host_data);
> >  
> > +	if (!ctx)
> > +		return;
> 
> I guess this should only happen if KVM failed to initialise or something,
> right? (e.g. if we were booted at EL1). If so, I'm wondering whether it
> would be better to do something like:
> 
> 	if (!is_hyp_mode_available())
> 		return;
> 
> 	WARN_ON_ONCE(!ctx);
> 
> so that any unexpected NULL pointer there screams loudly, rather than causes
> the register switch to be silently ignored. What do you think?

Unfortunately, this happens on every boot. I don't fully understand how the
boot order is determined, so please correct me if this makes no sense, but
kvm_clr_pmu_events is called as part of CPUHP_AP_PERF_ARM_STARTING. The first
time that happens is before KVM is initialized (tested by inserting
BUG_ON(!ctx)). Currently that's not a problem: the per-CPU memory is there
and it's all zeroes. It becomes a problem with this patch because the per-CPU
memory is not there *yet*.
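
For the record, the call chain I observed (function names quoted from memory
of the ARM PMU driver, so treat them as approximate):

	/*
	 * CPUHP_AP_PERF_ARM_STARTING
	 *   -> arm_perf_starting_cpu()
	 *     -> armv8pmu_reset()
	 *       -> kvm_clr_pmu_events(U32_MAX)
	 *
	 * This runs before init_hyp_mode(), while kvm_arm_hyp_percpu_base is
	 * still NULL, so this_cpu_ptr_hyp(kvm_host_data) returns NULL and the
	 * new guard bails out instead of dereferencing it.
	 */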

Patch

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index abc03f386b40..e0e1e404f6eb 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -66,8 +66,23 @@ 
 #define CHOOSE_VHE_SYM(sym)	sym
 #define CHOOSE_NVHE_SYM(sym)	kvm_nvhe_sym(sym)
 
-#define this_cpu_ptr_nvhe(sym)		this_cpu_ptr(&kvm_nvhe_sym(sym))
-#define per_cpu_ptr_nvhe(sym, cpu)	per_cpu_ptr(&kvm_nvhe_sym(sym), cpu)
+/* Array of percpu base addresses. Length of the array is nr_cpu_ids. */
+extern unsigned long *kvm_arm_hyp_percpu_base;
+
+/*
+ * Compute pointer to a symbol defined in nVHE percpu region.
+ * Returns NULL if percpu memory has not been allocated yet.
+ */
+#define this_cpu_ptr_nvhe(sym)	per_cpu_ptr_nvhe(sym, smp_processor_id())
+#define per_cpu_ptr_nvhe(sym, cpu)					\
+	({								\
+		unsigned long base, off;				\
+		base = kvm_arm_hyp_percpu_base				\
+			? kvm_arm_hyp_percpu_base[cpu] : 0;		\
+		off = (unsigned long)&kvm_nvhe_sym(sym) -		\
+		      (unsigned long)&kvm_nvhe_sym(__per_cpu_start);	\
+		base ? (typeof(kvm_nvhe_sym(sym))*)(base + off) : NULL;	\
+	})
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 /*
diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h
index 3994169985ef..5062553a6847 100644
--- a/arch/arm64/include/asm/sections.h
+++ b/arch/arm64/include/asm/sections.h
@@ -18,5 +18,6 @@  extern char __exittext_begin[], __exittext_end[];
 extern char __irqentry_text_start[], __irqentry_text_end[];
 extern char __mmuoff_data_start[], __mmuoff_data_end[];
 extern char __entry_tramp_text_start[], __entry_tramp_text_end[];
+extern char __kvm_nvhe___per_cpu_start[], __kvm_nvhe___per_cpu_end[];
 
 #endif /* __ASM_SECTIONS_H */
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 0d543570e811..d52e6b5dbfd3 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -9,6 +9,7 @@ 
 
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/cache.h>
+#include <asm/hyp_image.h>
 #include <asm/kernel-pgtable.h>
 #include <asm/memory.h>
 #include <asm/page.h>
@@ -27,8 +28,15 @@  jiffies = jiffies_64;
 	__start___kvm_ex_table = .;				\
 	*(__kvm_ex_table)					\
 	__stop___kvm_ex_table = .;
+
+#define HYPERVISOR_PERCPU_SECTION				\
+	. = ALIGN(PAGE_SIZE);					\
+	HYP_SECTION_NAME(.data..percpu) : {			\
+		*(HYP_SECTION_NAME(.data..percpu))		\
+	}
 #else /* CONFIG_KVM */
 #define HYPERVISOR_EXTABLE
+#define HYPERVISOR_PERCPU_SECTION
 #endif
 
 #define HYPERVISOR_TEXT					\
@@ -194,6 +202,7 @@  SECTIONS
 	}
 
 	PERCPU_SECTION(L1_CACHE_BYTES)
+	HYPERVISOR_PERCPU_SECTION
 
 	.rela.dyn : ALIGN(8) {
 		*(.rela .rela*)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 58d7d614519b..8293319a32e7 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -47,6 +47,7 @@  __asm__(".arch_extension	virt");
 #endif
 
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+unsigned long *kvm_arm_hyp_percpu_base;
 
 /* The VMID used in the VTTBR */
 static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
@@ -1258,6 +1259,15 @@  long kvm_arch_vm_ioctl(struct file *filp,
 	}
 }
 
+#define kvm_hyp_percpu_base(cpu)	((unsigned long)per_cpu_ptr_nvhe(__per_cpu_start, cpu))
+#define kvm_hyp_percpu_array_size	(nr_cpu_ids * sizeof(*kvm_arm_hyp_percpu_base))
+#define kvm_hyp_percpu_array_order	(get_order(kvm_hyp_percpu_array_size))
+#define kvm_hyp_percpu_start		(CHOOSE_NVHE_SYM(__per_cpu_start))
+#define kvm_hyp_percpu_end		(CHOOSE_NVHE_SYM(__per_cpu_end))
+#define kvm_hyp_percpu_size		((unsigned long)kvm_hyp_percpu_end - \
+					 (unsigned long)kvm_hyp_percpu_start)
+#define kvm_hyp_percpu_order		(kvm_hyp_percpu_size ? get_order(kvm_hyp_percpu_size) : 0)
+
 static void cpu_init_hyp_mode(void)
 {
 	phys_addr_t pgd_ptr;
@@ -1273,8 +1283,8 @@  static void cpu_init_hyp_mode(void)
 	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
 	 * so that we can use adr_l to access per-cpu variables in EL2.
 	 */
-	tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) -
-		     (unsigned long)kvm_ksym_ref(&kvm_host_data));
+	tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe(__per_cpu_start) -
+		    (unsigned long)kvm_ksym_ref(kvm_hyp_percpu_start);
 
 	pgd_ptr = kvm_mmu_get_httbr();
 	hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
@@ -1506,8 +1516,11 @@  static void teardown_hyp_mode(void)
 	int cpu;
 
 	free_hyp_pgds();
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		free_hyp_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+		free_hyp_pages(kvm_hyp_percpu_base(cpu), kvm_hyp_percpu_order);
+	}
+	free_hyp_pages((unsigned long)kvm_arm_hyp_percpu_base, kvm_hyp_percpu_array_order);
 }
 
 /**
@@ -1540,6 +1553,28 @@  static int init_hyp_mode(void)
 		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
 	}
 
+	/*
+	 * Allocate and initialize pages for Hypervisor-mode percpu regions.
+	 */
+	kvm_arm_hyp_percpu_base = (unsigned long *)alloc_hyp_pages(
+			GFP_KERNEL | __GFP_ZERO, kvm_hyp_percpu_array_order);
+	if (!kvm_arm_hyp_percpu_base) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+	for_each_possible_cpu(cpu) {
+		unsigned long percpu_base;
+
+		percpu_base = alloc_hyp_pages(GFP_KERNEL, kvm_hyp_percpu_order);
+		if (!percpu_base) {
+			err = -ENOMEM;
+			goto out_err;
+		}
+
+		memcpy((void *)percpu_base, kvm_hyp_percpu_start, kvm_hyp_percpu_size);
+		kvm_arm_hyp_percpu_base[cpu] = percpu_base;
+	}
+
 	/*
 	 * Map the Hyp-code called directly from the host
 	 */
@@ -1584,6 +1619,21 @@  static int init_hyp_mode(void)
 		}
 	}
 
+	/*
+	 * Map Hyp percpu pages
+	 */
+	for_each_possible_cpu(cpu) {
+		char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
+		char *percpu_end = percpu_begin + PAGE_ALIGN(kvm_hyp_percpu_size);
+
+		err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
+
+		if (err) {
+			kvm_err("Cannot map hyp percpu region\n");
+			goto out_err;
+		}
+	}
+
 	for_each_possible_cpu(cpu) {
 		kvm_host_data_t *cpu_data;
 
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index 3b13d1c7cd1a..bb2d986ff696 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -7,7 +7,13 @@ 
  */
 
 #include <asm/hyp_image.h>
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/memory.h>
 
 SECTIONS {
 	HYP_SECTION(.text)
+	HYP_SECTION_NAME(.data..percpu) : {
+		PERCPU_INPUT(L1_CACHE_BYTES)
+	}
 }
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 6d80ffe1ebfc..cd653091f213 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -33,7 +33,7 @@  void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
 {
 	struct kvm_host_data *ctx = this_cpu_ptr_hyp(kvm_host_data);
 
-	if (!kvm_pmu_switch_needed(attr))
+	if (!ctx || !kvm_pmu_switch_needed(attr))
 		return;
 
 	if (!attr->exclude_host)
@@ -49,6 +49,9 @@  void kvm_clr_pmu_events(u32 clr)
 {
 	struct kvm_host_data *ctx = this_cpu_ptr_hyp(kvm_host_data);
 
+	if (!ctx)
+		return;
+
 	ctx->pmu_events.events_host &= ~clr;
 	ctx->pmu_events.events_guest &= ~clr;
 }