[v2 (resend) 07/27] x86: Map/unmap pages in restore_all_guests

Message ID: 20240116192611.41112-8-eliasely@amazon.com (mailing list archive)
State: New
Series: Remove the directmap

Commit Message

Elias El Yandouzi Jan. 16, 2024, 7:25 p.m. UTC
From: Hongyan Xia <hongyxia@amazon.com>

Before, it assumed the pv cr3 could be accessed via a direct map. This
is no longer true.

Note that we do not map and unmap root_pgt for now since it is still a
xenheap page.

Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>

---

    Changes in V2:
        * Rework the shadow perdomain mapping solution in the follow-up patches

    Changes since Hongyan's version:
        * Remove the final dot in the commit title

Comments

Jan Beulich Feb. 20, 2024, 9:51 a.m. UTC | #1
On 16.01.2024 20:25, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
> 
> Before, it assumed the pv cr3 could be accessed via a direct map. This
> is no longer true.

There are a number of terminology issues here, starting with the title:
Unlike (iirc) in an earlier version, no mapping/unmapping occurs in
restore_all_guests itself anymore. When reading just the title I
thought "What? Didn't I say no there?" Then for the sentence above: If
what it stated was true, a bug would have been introduced in one of
the earlier patches. What I think is meant, though, is that this is
going to be no longer true.

Finally the use of "shadow" in identifiers in the code changes
themselves is somewhat problematic imo, seeing the shadow paging mode
we support for both HVM and PV. The nature here is different (it's
merely a secondary mapping aiui), so I think it would be better to
avoid the term "shadow". Maybe ...

> --- a/xen/arch/x86/include/asm/config.h
> +++ b/xen/arch/x86/include/asm/config.h
> @@ -202,7 +202,7 @@ extern unsigned char boot_edid_info[128];
>  /* Slot 260: per-domain mappings (including map cache). */
>  #define PERDOMAIN_VIRT_START    (PML4_ADDR(260))
>  #define PERDOMAIN_SLOT_MBYTES   (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
> -#define PERDOMAIN_SLOTS         3
> +#define PERDOMAIN_SLOTS         4
>  #define PERDOMAIN_VIRT_SLOT(s)  (PERDOMAIN_VIRT_START + (s) * \
>                                   (PERDOMAIN_SLOT_MBYTES << 20))
>  /* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */
> @@ -316,6 +316,16 @@ extern unsigned long xen_phys_start;
>  #define ARG_XLAT_START(v)        \
>      (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
>  
> +/* root_pt shadow mapping area. The fourth per-domain-mapping sub-area */
> +#define SHADOW_ROOT_PT_VIRT_START   PERDOMAIN_VIRT_SLOT(3)
> +#define SHADOW_ROOT_PT_ENTRIES      MAX_VIRT_CPUS
> +#define SHADOW_ROOT_PT_VIRT_END     (SHADOW_ROOT_PT_VIRT_START +    \
> +                                     (SHADOW_ROOT_PT_ENTRIES * PAGE_SIZE))
> +
> +/* The address of a particular VCPU's ROOT_PT */
> +#define SHADOW_ROOT_PT_VCPU_VIRT_START(v) \
> +    (SHADOW_ROOT_PT_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))

... ROOT_PT_MAPPING_* throughout, or PV_ROOT_PT_MAPPING_*?

As to SHADOW_ROOT_PT_VIRT_END - when trying to check where it's used
and hence whether it really needs to use MAX_VIRT_CPUS I couldn't
spot any use. I don't think the constant should be defined when it's
not needed.
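
Putting both points together, the declarations might then look along these
lines (a rough sketch only; the PV_ROOT_PT_MAPPING_* names are merely
illustrative, and the unused _VIRT_END constant is dropped):

    /* PV guest root page table mapping area. The 4th per-domain sub-area. */
    #define PV_ROOT_PT_MAPPING_VIRT_START   PERDOMAIN_VIRT_SLOT(3)
    #define PV_ROOT_PT_MAPPING_ENTRIES      MAX_VIRT_CPUS

    /* The address of a particular vCPU's mapped root page table. */
    #define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
        (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))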

> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -505,6 +505,13 @@ void share_xen_page_with_guest(struct page_info *page, struct domain *d,
>      spin_unlock(&d->page_alloc_lock);
>  }
>  
> +#define shadow_root_pt_idx(v) \
> +    ((v)->vcpu_id >> PAGETABLE_ORDER)
> +
> +#define pv_shadow_root_pt_pte(v) \
> +    ((v)->domain->arch.pv.shadow_root_pt_l1tab[shadow_root_pt_idx(v)] + \
> +     ((v)->vcpu_id & (L1_PAGETABLE_ENTRIES - 1)))

I think uniformly named constants want using in both macros, i.e. either
some L1_* one in the first macro (preferred) or an expression derived from
PAGETABLE_ORDER in the 2nd.
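
I.e. perhaps, as a sketch of the preferred variant (using
L1_PAGETABLE_ENTRIES uniformly, which equals 1 << PAGETABLE_ORDER):

    #define shadow_root_pt_idx(v) \
        ((v)->vcpu_id / L1_PAGETABLE_ENTRIES)

    #define pv_shadow_root_pt_pte(v) \
        ((v)->domain->arch.pv.shadow_root_pt_l1tab[shadow_root_pt_idx(v)] + \
         ((v)->vcpu_id % L1_PAGETABLE_ENTRIES))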

> @@ -524,6 +531,13 @@ void write_ptbase(struct vcpu *v)
>  
>      if ( is_pv_vcpu(v) && v->domain->arch.pv.xpti )
>      {
> +        mfn_t guest_root_pt = _mfn(v->arch.cr3 >> PAGE_SHIFT);

While we do so in several other places, I think we'd be better off not
continuing to assume the top bits to all be zero. IOW MASK_EXTR() may
be better to use here.
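
E.g. (a sketch, assuming X86_CR3_ADDR_MASK covers exactly the address
bits, thus also stripping any PCID and NOFLUSH bits):

    /* Extract the MFN without assuming the control bits are clear. */
    mfn_t guest_root_pt = _mfn(MASK_EXTR(v->arch.cr3, X86_CR3_ADDR_MASK));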

> +        l1_pgentry_t *pte = pv_shadow_root_pt_pte(v);
> +
> +        ASSERT(v == current);
> +
> +        l1e_write(pte, l1e_from_mfn(guest_root_pt, __PAGE_HYPERVISOR_RW));

The mapping is the copy source in restore_all_guests, isn't it? In
which case couldn't this be a r/o mapping?
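
I.e. perhaps simply (sketch; the entry is only ever read through, as the
copy source, in restore_all_guest):

    l1e_write(pte, l1e_from_mfn(guest_root_pt, __PAGE_HYPERVISOR_RO));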

> --- a/xen/arch/x86/pv/domain.c
> +++ b/xen/arch/x86/pv/domain.c
> @@ -288,6 +288,19 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
>                                1U << GDT_LDT_VCPU_SHIFT);
>  }
>  
> +static int pv_create_shadow_root_pt_l1tab(struct vcpu *v)
> +{
> +    return create_perdomain_mapping(v->domain, SHADOW_ROOT_PT_VCPU_VIRT_START(v),

This line looks to be too long. But ...

> +                                    1, v->domain->arch.pv.shadow_root_pt_l1tab,
> +                                    NULL);
> +}
> +
> +static void pv_destroy_shadow_root_pt_l1tab(struct vcpu *v)
> +
> +{
> +    destroy_perdomain_mapping(v->domain, SHADOW_ROOT_PT_VCPU_VIRT_START(v), 1);
> +}

... I'm not convinced of the usefulness of these wrapper functions
anyway, even more so that each is used exactly once.
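
Open-coding the single call site in pv_vcpu_initialise() would then look
along these lines (sketch), with destroy_perdomain_mapping() likewise
called directly from pv_vcpu_destroy():

    rc = create_perdomain_mapping(v->domain,
                                  SHADOW_ROOT_PT_VCPU_VIRT_START(v), 1,
                                  v->domain->arch.pv.shadow_root_pt_l1tab,
                                  NULL);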

> @@ -297,6 +310,7 @@ void pv_vcpu_destroy(struct vcpu *v)
>      }
>  
>      pv_destroy_gdt_ldt_l1tab(v);
> +    pv_destroy_shadow_root_pt_l1tab(v);
>      XFREE(v->arch.pv.trap_ctxt);
>  }
>  
> @@ -311,6 +325,13 @@ int pv_vcpu_initialise(struct vcpu *v)
>      if ( rc )
>          return rc;
>  
> +    if ( v->domain->arch.pv.xpti )
> +    {
> +        rc = pv_create_shadow_root_pt_l1tab(v);
> +        if ( rc )
> +            goto done;
> +    }
> +
>      BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
>                   PAGE_SIZE);
>      v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
> @@ -346,10 +367,12 @@ void pv_domain_destroy(struct domain *d)
>  
>      destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
>                                GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
> +    destroy_perdomain_mapping(d, SHADOW_ROOT_PT_VIRT_START, SHADOW_ROOT_PT_ENTRIES);

Too long line again.

>      XFREE(d->arch.pv.cpuidmasks);
>  
>      FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
> +    FREE_XENHEAP_PAGE(d->arch.pv.shadow_root_pt_l1tab);
>  }
>  
>  void noreturn cf_check continue_pv_domain(void);
> @@ -371,6 +394,12 @@ int pv_domain_initialise(struct domain *d)
>          goto fail;
>      clear_page(d->arch.pv.gdt_ldt_l1tab);
>  
> +    d->arch.pv.shadow_root_pt_l1tab =
> +        alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
> +    if ( !d->arch.pv.shadow_root_pt_l1tab )
> +        goto fail;
> +    clear_page(d->arch.pv.shadow_root_pt_l1tab);

Looks like you simply cloned the GDT/LDT code. That's covering 128k
of VA space per vCPU, though, while here you'd be using only 4k. Hence
using a full page looks like a factor-of-32 over-allocation. And once
using xzalloc() here instead, a further question would be whether to
limit the allocation to the domain's actual needs - most domains will
have far fewer than 8k vCPU-s. In the common case (up to 512 vCPU-s) a
single slot will suffice, at which point a yet further question would
be whether, in that common case, to embed the "array" in struct
pv_domain itself (e.g. by using a union).
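
A sketch of such a right-sized allocation (ignoring the suggested union
for the single-slot case; the corresponding free would then use XFREE()
rather than FREE_XENHEAP_PAGE(), and the xzalloc_array() zero-fills, so
no clear_page() is needed):

    d->arch.pv.shadow_root_pt_l1tab =
        xzalloc_array(l1_pgentry_t *,
                      DIV_ROUND_UP(d->max_vcpus, L1_PAGETABLE_ENTRIES));
    if ( !d->arch.pv.shadow_root_pt_l1tab )
        goto fail;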

> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -165,7 +165,15 @@ restore_all_guest:
>          and   %rsi, %rdi
>          and   %r9, %rsi
>          add   %rcx, %rdi
> +
> +        /*
> +         * The address in the vCPU cr3 is always mapped in the shadow
> +         * root_pt virt area.
> +         */
> +        imul $PAGE_SIZE, VCPU_id(%rbx), %esi

Nit: Another blank please after the insn mnemonic, to match what's
upwards in context as well as ...

> +        movabs $SHADOW_ROOT_PT_VIRT_START, %rcx
>          add   %rcx, %rsi

... this.

Jan

Patch

diff --git a/xen/arch/x86/include/asm/config.h b/xen/arch/x86/include/asm/config.h
index bbced338be..7cf1f33dc0 100644
--- a/xen/arch/x86/include/asm/config.h
+++ b/xen/arch/x86/include/asm/config.h
@@ -202,7 +202,7 @@  extern unsigned char boot_edid_info[128];
 /* Slot 260: per-domain mappings (including map cache). */
 #define PERDOMAIN_VIRT_START    (PML4_ADDR(260))
 #define PERDOMAIN_SLOT_MBYTES   (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
-#define PERDOMAIN_SLOTS         3
+#define PERDOMAIN_SLOTS         4
 #define PERDOMAIN_VIRT_SLOT(s)  (PERDOMAIN_VIRT_START + (s) * \
                                  (PERDOMAIN_SLOT_MBYTES << 20))
 /* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */
@@ -316,6 +316,16 @@  extern unsigned long xen_phys_start;
 #define ARG_XLAT_START(v)        \
     (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
 
+/* root_pt shadow mapping area. The fourth per-domain-mapping sub-area */
+#define SHADOW_ROOT_PT_VIRT_START   PERDOMAIN_VIRT_SLOT(3)
+#define SHADOW_ROOT_PT_ENTRIES      MAX_VIRT_CPUS
+#define SHADOW_ROOT_PT_VIRT_END     (SHADOW_ROOT_PT_VIRT_START +    \
+                                     (SHADOW_ROOT_PT_ENTRIES * PAGE_SIZE))
+
+/* The address of a particular VCPU's ROOT_PT */
+#define SHADOW_ROOT_PT_VCPU_VIRT_START(v) \
+    (SHADOW_ROOT_PT_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
+
 #define ELFSIZE 64
 
 #define ARCH_CRASH_SAVE_VMCOREINFO
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index 622d22bef2..4d97c68028 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -273,6 +273,7 @@  struct time_scale {
 struct pv_domain
 {
     l1_pgentry_t **gdt_ldt_l1tab;
+    l1_pgentry_t **shadow_root_pt_l1tab;
 
     atomic_t nr_l4_pages;
 
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index b56e0d8065..a72c32d87c 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -505,6 +505,13 @@  void share_xen_page_with_guest(struct page_info *page, struct domain *d,
     spin_unlock(&d->page_alloc_lock);
 }
 
+#define shadow_root_pt_idx(v) \
+    ((v)->vcpu_id >> PAGETABLE_ORDER)
+
+#define pv_shadow_root_pt_pte(v) \
+    ((v)->domain->arch.pv.shadow_root_pt_l1tab[shadow_root_pt_idx(v)] + \
+     ((v)->vcpu_id & (L1_PAGETABLE_ENTRIES - 1)))
+
 void make_cr3(struct vcpu *v, mfn_t mfn)
 {
     struct domain *d = v->domain;
@@ -524,6 +531,13 @@  void write_ptbase(struct vcpu *v)
 
     if ( is_pv_vcpu(v) && v->domain->arch.pv.xpti )
     {
+        mfn_t guest_root_pt = _mfn(v->arch.cr3 >> PAGE_SHIFT);
+        l1_pgentry_t *pte = pv_shadow_root_pt_pte(v);
+
+        ASSERT(v == current);
+
+        l1e_write(pte, l1e_from_mfn(guest_root_pt, __PAGE_HYPERVISOR_RW));
+
         cpu_info->root_pgt_changed = true;
         cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
         if ( new_cr4 & X86_CR4_PCIDE )
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 2a445bb17b..fef9ae2352 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -288,6 +288,19 @@  static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
                               1U << GDT_LDT_VCPU_SHIFT);
 }
 
+static int pv_create_shadow_root_pt_l1tab(struct vcpu *v)
+{
+    return create_perdomain_mapping(v->domain, SHADOW_ROOT_PT_VCPU_VIRT_START(v),
+                                    1, v->domain->arch.pv.shadow_root_pt_l1tab,
+                                    NULL);
+}
+
+static void pv_destroy_shadow_root_pt_l1tab(struct vcpu *v)
+
+{
+    destroy_perdomain_mapping(v->domain, SHADOW_ROOT_PT_VCPU_VIRT_START(v), 1);
+}
+
 void pv_vcpu_destroy(struct vcpu *v)
 {
     if ( is_pv_32bit_vcpu(v) )
@@ -297,6 +310,7 @@  void pv_vcpu_destroy(struct vcpu *v)
     }
 
     pv_destroy_gdt_ldt_l1tab(v);
+    pv_destroy_shadow_root_pt_l1tab(v);
     XFREE(v->arch.pv.trap_ctxt);
 }
 
@@ -311,6 +325,13 @@  int pv_vcpu_initialise(struct vcpu *v)
     if ( rc )
         return rc;
 
+    if ( v->domain->arch.pv.xpti )
+    {
+        rc = pv_create_shadow_root_pt_l1tab(v);
+        if ( rc )
+            goto done;
+    }
+
     BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
                  PAGE_SIZE);
     v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
@@ -346,10 +367,12 @@  void pv_domain_destroy(struct domain *d)
 
     destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
                               GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
+    destroy_perdomain_mapping(d, SHADOW_ROOT_PT_VIRT_START, SHADOW_ROOT_PT_ENTRIES);
 
     XFREE(d->arch.pv.cpuidmasks);
 
     FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
+    FREE_XENHEAP_PAGE(d->arch.pv.shadow_root_pt_l1tab);
 }
 
 void noreturn cf_check continue_pv_domain(void);
@@ -371,6 +394,12 @@  int pv_domain_initialise(struct domain *d)
         goto fail;
     clear_page(d->arch.pv.gdt_ldt_l1tab);
 
+    d->arch.pv.shadow_root_pt_l1tab =
+        alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
+    if ( !d->arch.pv.shadow_root_pt_l1tab )
+        goto fail;
+    clear_page(d->arch.pv.shadow_root_pt_l1tab);
+
     if ( levelling_caps & ~LCAP_faulting &&
          (d->arch.pv.cpuidmasks = xmemdup(&cpuidmask_defaults)) == NULL )
         goto fail;
@@ -381,6 +410,11 @@  int pv_domain_initialise(struct domain *d)
     if ( rc )
         goto fail;
 
+    rc = create_perdomain_mapping(d, SHADOW_ROOT_PT_VIRT_START,
+                                  SHADOW_ROOT_PT_ENTRIES, NULL, NULL);
+    if ( rc )
+        goto fail;
+
     d->arch.ctxt_switch = &pv_csw;
 
     d->arch.pv.xpti = is_hardware_domain(d) ? opt_xpti_hwdom : opt_xpti_domu;
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index 57b73a4e62..23f9cca1a2 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -51,6 +51,7 @@  void __dummy__(void)
     OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es);
     BLANK();
 
+    OFFSET(VCPU_id, struct vcpu, vcpu_id);
     OFFSET(VCPU_processor, struct vcpu, processor);
     OFFSET(VCPU_domain, struct vcpu, domain);
     OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index c25b14dde6..a216c5ca7a 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -165,7 +165,15 @@  restore_all_guest:
         and   %rsi, %rdi
         and   %r9, %rsi
         add   %rcx, %rdi
+
+        /*
+         * The address in the vCPU cr3 is always mapped in the shadow
+         * root_pt virt area.
+         */
+        imul $PAGE_SIZE, VCPU_id(%rbx), %esi
+        movabs $SHADOW_ROOT_PT_VIRT_START, %rcx
         add   %rcx, %rsi
+
         mov   $ROOT_PAGETABLE_FIRST_XEN_SLOT, %ecx
         mov   root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rsi), %r8
         mov   %r8, root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rdi)