[15/22] x86/idle: allow using a per-pCPU L4

Message ID 20240726152206.28411-16-roger.pau@citrix.com (mailing list archive)
State New
Series x86: adventures in Address Space Isolation

Commit Message

Roger Pau Monné July 26, 2024, 3:21 p.m. UTC
Introduce support for possibly using a different L4 across the idle vCPUs.

This change only introduces support for loading a per-pCPU idle L4, but even
with the per-CPU idle page-table enabled it should still be a clone of
idle_pg_table, hence no functional change is expected.

Note that the idle L4 is not changed after Xen has reached the SYS_STATE_smp_boot
state, hence there is no need to synchronize the contents of the L4 once the
CPUs are started.

Using a per-CPU idle page-table is not strictly required for the Address Space
Isolation work, as idle page tables are never used when running guests.
However, it simplifies memory management of the per-CPU mappings, as creating
per-CPU mappings only requires using the idle page-table of the CPU where the
mappings should be created.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/boot/x86_64.S       | 11 +++++++++++
 xen/arch/x86/domain.c            | 20 +++++++++++++++++++-
 xen/arch/x86/domain_page.c       |  2 +-
 xen/arch/x86/include/asm/setup.h |  1 +
 xen/arch/x86/setup.c             |  3 +++
 xen/arch/x86/smpboot.c           |  7 +++++++
 6 files changed, 42 insertions(+), 2 deletions(-)
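
To illustrate the simplification claimed in the commit message: once each pCPU
owns a clone of idle_pg_table, installing a per-CPU mapping only touches the
local root page-table. A minimal sketch, assuming a hypothetical
percpu_set_l4e() helper that is not part of this patch:

    /*
     * Hypothetical helper: write an L4 entry into the local pCPU's idle
     * root page-table only.  idle_vcpu[], cr3_pa() and maddr_to_virt()
     * are used the same way as elsewhere in this patch.
     */
    static void percpu_set_l4e(unsigned int slot, root_pgentry_t e)
    {
        root_pgentry_t *root =
            maddr_to_virt(cr3_pa(idle_vcpu[smp_processor_id()]->arch.cr3));

        /* No other pCPU's L4 clone is affected by this write. */
        root[slot] = e;
    }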

Comments

Alejandro Vallejo Aug. 21, 2024, 4:42 p.m. UTC | #1
On Fri Jul 26, 2024 at 4:21 PM BST, Roger Pau Monne wrote:
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index 9cfcf0dc63f3..b62c4311da6c 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -555,6 +555,7 @@ void arch_vcpu_regs_init(struct vcpu *v)
>  int arch_vcpu_create(struct vcpu *v)
>  {
>      struct domain *d = v->domain;
> +    root_pgentry_t *pgt = NULL;
>      int rc;
>  
>      v->arch.flags = TF_kernel_mode;
> @@ -589,7 +590,23 @@ int arch_vcpu_create(struct vcpu *v)
>      else
>      {
>          /* Idle domain */
> -        v->arch.cr3 = __pa(idle_pg_table);
> +        if ( (opt_asi_pv || opt_asi_hvm) && v->vcpu_id )
> +        {
> +            pgt = alloc_xenheap_page();
> +
> +            /*
> +             * For the idle vCPU 0 (the BSP idle vCPU) use idle_pg_table
> +             * directly, there's no need to create yet another copy.
> +             */

Shouldn't this comment be in the else branch instead? Or reworded to refer to
non-0 vCPUs.
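
Something like this, perhaps (sketch, untested):

        else
            /*
             * For the idle vCPU 0 (the BSP idle vCPU) use idle_pg_table
             * directly, there's no need to create yet another copy.
             */
            v->arch.cr3 = __pa(idle_pg_table);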

> +            rc = -ENOMEM;

While it's true rc is overridden later, I feel uneasy leaving it set to
-ENOMEM after the check. Could we have it immediately before "goto fail"?
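
I.e. (sketch):

            if ( !pgt )
            {
                rc = -ENOMEM;
                goto fail;
            }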

> +            if ( !pgt )
> +                goto fail;
> +
> +            copy_page(pgt, idle_pg_table);
> +            v->arch.cr3 = __pa(pgt);
> +        }
> +        else
> +            v->arch.cr3 = __pa(idle_pg_table);
>          rc = 0;
>          v->arch.msrs = ZERO_BLOCK_PTR; /* Catch stray misuses */
>      }
> @@ -611,6 +628,7 @@ int arch_vcpu_create(struct vcpu *v)
>      vcpu_destroy_fpu(v);
>      xfree(v->arch.msrs);
>      v->arch.msrs = NULL;
> +    free_xenheap_page(pgt);
>  
>      return rc;
>  }

I guess the idle domain has a forever lifetime and its vCPUs are kept around
forever too, right? Otherwise we'd need extra logic in vcpu_destroy() to free
the page-table copies, should they exist.
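
If they aren't, the destroy path would need something along these lines (a
hypothetical sketch; the is_idle_domain() guard keeps the shared
idle_pg_table from ever being freed):

    if ( is_idle_domain(v->domain) &&
         cr3_pa(v->arch.cr3) != __pa(idle_pg_table) )
        free_xenheap_page(maddr_to_virt(cr3_pa(v->arch.cr3)));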

Cheers,
Alejandro

Patch

diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S
index 04bb62ae8680..af7854820185 100644
--- a/xen/arch/x86/boot/x86_64.S
+++ b/xen/arch/x86/boot/x86_64.S
@@ -15,6 +15,17 @@ ENTRY(__high_start)
         mov     $XEN_MINIMAL_CR4,%rcx
         mov     %rcx,%cr4
 
+        /*
+         * Possibly switch to the per-CPU idle page-tables. Note we cannot
+         * switch earlier as the per-CPU page-tables might be above 4G, and
+         * hence need to load them from 64bit code.
+         */
+        mov     ap_cr3(%rip), %rax
+        test    %rax, %rax
+        jz      .L_skip_cr3
+        mov     %rax, %cr3
+.L_skip_cr3:
+
         mov     stack_start(%rip),%rsp
 
         /* Reset EFLAGS (subsumes CLI and CLD). */
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 9cfcf0dc63f3..b62c4311da6c 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -555,6 +555,7 @@ void arch_vcpu_regs_init(struct vcpu *v)
 int arch_vcpu_create(struct vcpu *v)
 {
     struct domain *d = v->domain;
+    root_pgentry_t *pgt = NULL;
     int rc;
 
     v->arch.flags = TF_kernel_mode;
@@ -589,7 +590,23 @@ int arch_vcpu_create(struct vcpu *v)
     else
     {
         /* Idle domain */
-        v->arch.cr3 = __pa(idle_pg_table);
+        if ( (opt_asi_pv || opt_asi_hvm) && v->vcpu_id )
+        {
+            pgt = alloc_xenheap_page();
+
+            /*
+             * For the idle vCPU 0 (the BSP idle vCPU) use idle_pg_table
+             * directly, there's no need to create yet another copy.
+             */
+            rc = -ENOMEM;
+            if ( !pgt )
+                goto fail;
+
+            copy_page(pgt, idle_pg_table);
+            v->arch.cr3 = __pa(pgt);
+        }
+        else
+            v->arch.cr3 = __pa(idle_pg_table);
         rc = 0;
         v->arch.msrs = ZERO_BLOCK_PTR; /* Catch stray misuses */
     }
@@ -611,6 +628,7 @@ int arch_vcpu_create(struct vcpu *v)
     vcpu_destroy_fpu(v);
     xfree(v->arch.msrs);
     v->arch.msrs = NULL;
+    free_xenheap_page(pgt);
 
     return rc;
 }
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index eac5e3304fb8..99b78af90fd3 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -51,7 +51,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
         if ( (v = idle_vcpu[smp_processor_id()]) == current )
             sync_local_execstate();
         /* We must now be running on the idle page table. */
-        ASSERT(cr3_pa(read_cr3()) == __pa(idle_pg_table));
+        ASSERT(cr3_pa(read_cr3()) == cr3_pa(v->arch.cr3));
     }
 
     return v;
diff --git a/xen/arch/x86/include/asm/setup.h b/xen/arch/x86/include/asm/setup.h
index d75589178b91..a8452fce8f05 100644
--- a/xen/arch/x86/include/asm/setup.h
+++ b/xen/arch/x86/include/asm/setup.h
@@ -14,6 +14,7 @@ extern unsigned long xenheap_initial_phys_start;
 extern uint64_t boot_tsc_stamp;
 
 extern void *stack_start;
+extern unsigned long ap_cr3;
 
 void early_cpu_init(bool verbose);
 void early_time_init(void);
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index bc387d96b519..c5a13b30daf4 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -158,6 +158,9 @@ char asmlinkage __section(".init.bss.stack_aligned") __aligned(STACK_SIZE)
 /* Used by the BSP/AP paths to find the higher half stack mapping to use. */
 void *stack_start = cpu0_stack + STACK_SIZE - sizeof(struct cpu_info);
 
+/* cr3 value for the AP to load on boot. */
+unsigned long ap_cr3;
+
 /* Used by the boot asm to stash the relocated multiboot info pointer. */
 unsigned int asmlinkage __initdata multiboot_ptr;
 
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 8aa621533f3d..e07add36b1b6 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -581,6 +581,13 @@ static int do_boot_cpu(int apicid, int cpu)
 
     stack_start = stack_base[cpu] + STACK_SIZE - sizeof(struct cpu_info);
 
+    /*
+     * If per-CPU idle root page table has been allocated, switch to it as
+     * part of the AP bringup trampoline.
+     */
+    ap_cr3 = idle_vcpu[cpu]->arch.cr3 != __pa(idle_pg_table) ?
+             idle_vcpu[cpu]->arch.cr3 : 0;
+
     /* This grunge runs the startup process for the targeted processor. */
 
     set_cpu_state(CPU_STATE_INIT);