diff mbox series

[V3,01/19] x86: Create per-domain mapping of guest_root_pt

Message ID 20240513111117.68828-2-eliasely@amazon.com (mailing list archive)
State Superseded
Headers show
Series Remove the directmap | expand

Commit Message

Elias El Yandouzi May 13, 2024, 11:10 a.m. UTC
From: Hongyan Xia <hongyxia@amazon.com>

Create a per-domain mapping of PV guest_root_pt as direct map is being
removed.

Note that we do not map and unmap root_pgt for now since it is still a
xenheap page.

Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>

----
    Changes in V3:
        * Rename SHADOW_ROOT
        * Haven't addressed the potentially over-allocation issue as I don't get it

    Changes in V2:
        * Rework the shadow perdomain mapping solution in the follow-up patches

    Changes since Hongyan's version:
        * Remove the final dot in the commit title

Comments

Roger Pau Monné May 13, 2024, 3:27 p.m. UTC | #1
On Mon, May 13, 2024 at 11:10:59AM +0000, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
> 
> Create a per-domain mapping of PV guest_root_pt as direct map is being
> removed.
> 
> Note that we do not map and unmap root_pgt for now since it is still a
> xenheap page.

I'm afraid this needs more context, at least for me to properly
understand.

I think I've figured out what create_perdomain_mapping() is supposed
to do, and I have to admit the interface is very awkward.

Anyway, attempted to provide some feedback.

> 
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
> Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>
> 
> ----
>     Changes in V3:
>         * Rename SHADOW_ROOT
>         * Haven't addressed the potentially over-allocation issue as I don't get it
> 
>     Changes in V2:
>         * Rework the shadow perdomain mapping solution in the follow-up patches
> 
>     Changes since Hongyan's version:
>         * Remove the final dot in the commit title
> 
> diff --git a/xen/arch/x86/include/asm/config.h b/xen/arch/x86/include/asm/config.h
> index ab7288cb36..5d710384df 100644
> --- a/xen/arch/x86/include/asm/config.h
> +++ b/xen/arch/x86/include/asm/config.h
> @@ -203,7 +203,7 @@ extern unsigned char boot_edid_info[128];
>  /* Slot 260: per-domain mappings (including map cache). */
>  #define PERDOMAIN_VIRT_START    (PML4_ADDR(260))
>  #define PERDOMAIN_SLOT_MBYTES   (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
> -#define PERDOMAIN_SLOTS         3
> +#define PERDOMAIN_SLOTS         4
>  #define PERDOMAIN_VIRT_SLOT(s)  (PERDOMAIN_VIRT_START + (s) * \
>                                   (PERDOMAIN_SLOT_MBYTES << 20))
>  /* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */
> @@ -317,6 +317,14 @@ extern unsigned long xen_phys_start;
>  #define ARG_XLAT_START(v)        \
>      (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
>  
> +/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
> +#define PV_ROOT_PT_MAPPING_VIRT_START   PERDOMAIN_VIRT_SLOT(3)
> +#define PV_ROOT_PT_MAPPING_ENTRIES      MAX_VIRT_CPUS
> +
> +/* The address of a particular VCPU's PV_ROOT_PT */
> +#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
> +    (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))

I know we are not there yet, but I wonder if we need to start having
some non-shared per-cpu mapping area in the page-tables.  Right now
this is shared between all the vCPUs, as it's per-domain space
(instead of per-vCPU).

> +
>  #define ELFSIZE 64
>  
>  #define ARCH_CRASH_SAVE_VMCOREINFO
> diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
> index f5daeb182b..8a97530607 100644
> --- a/xen/arch/x86/include/asm/domain.h
> +++ b/xen/arch/x86/include/asm/domain.h
> @@ -272,6 +272,7 @@ struct time_scale {
>  struct pv_domain
>  {
>      l1_pgentry_t **gdt_ldt_l1tab;
> +    l1_pgentry_t **root_pt_l1tab;
>  
>      atomic_t nr_l4_pages;
>  
> diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
> index d968bbbc73..efdf20f775 100644
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -505,6 +505,13 @@ void share_xen_page_with_guest(struct page_info *page, struct domain *d,
>      nrspin_unlock(&d->page_alloc_lock);
>  }
>  
> +#define pv_root_pt_idx(v) \
> +    ((v)->vcpu_id >> PAGETABLE_ORDER)
> +
> +#define pv_root_pt_pte(v) \
> +    ((v)->domain->arch.pv.root_pt_l1tab[pv_root_pt_idx(v)] + \
> +     ((v)->vcpu_id & (L1_PAGETABLE_ENTRIES - 1)))
> +
>  void make_cr3(struct vcpu *v, mfn_t mfn)
>  {
>      struct domain *d = v->domain;
> @@ -524,6 +531,13 @@ void write_ptbase(struct vcpu *v)
>  
>      if ( is_pv_vcpu(v) && v->domain->arch.pv.xpti )
>      {
> +        mfn_t guest_root_pt = _mfn(MASK_EXTR(v->arch.cr3, PAGE_MASK));
> +        l1_pgentry_t *pte = pv_root_pt_pte(v);
> +
> +        ASSERT(v == current);
> +
> +        l1e_write(pte, l1e_from_mfn(guest_root_pt, __PAGE_HYPERVISOR_RO));
> +
>          cpu_info->root_pgt_changed = true;
>          cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
>          if ( new_cr4 & X86_CR4_PCIDE )
> diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
> index 2a445bb17b..1b025986f7 100644
> --- a/xen/arch/x86/pv/domain.c
> +++ b/xen/arch/x86/pv/domain.c
> @@ -288,6 +288,21 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
>                                1U << GDT_LDT_VCPU_SHIFT);
>  }
>  
> +static int pv_create_root_pt_l1tab(struct vcpu *v)
> +{
> +    return create_perdomain_mapping(v->domain,
> +                                    PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
> +                                    1, v->domain->arch.pv.root_pt_l1tab,
> +                                    NULL);
> +}
> +
> +static void pv_destroy_root_pt_l1tab(struct vcpu *v)

The two 'v' parameters could be const here.

> +
> +{
> +    destroy_perdomain_mapping(v->domain,
> +                              PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v), 1);
> +}
> +
>  void pv_vcpu_destroy(struct vcpu *v)
>  {
>      if ( is_pv_32bit_vcpu(v) )
> @@ -297,6 +312,7 @@ void pv_vcpu_destroy(struct vcpu *v)
>      }
>  
>      pv_destroy_gdt_ldt_l1tab(v);
> +    pv_destroy_root_pt_l1tab(v);
>      XFREE(v->arch.pv.trap_ctxt);
>  }
>  
> @@ -311,6 +327,13 @@ int pv_vcpu_initialise(struct vcpu *v)
>      if ( rc )
>          return rc;
>  
> +    if ( v->domain->arch.pv.xpti )
> +    {
> +        rc = pv_create_root_pt_l1tab(v);
> +        if ( rc )
> +            goto done;
> +    }
> +
>      BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
>                   PAGE_SIZE);
>      v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
> @@ -346,10 +369,12 @@ void pv_domain_destroy(struct domain *d)
>  
>      destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
>                                GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
> +    destroy_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START, PV_ROOT_PT_MAPPING_ENTRIES);

Line too long.  Also see comment below about using d->max_vcpus
instead of MAX_VIRT_CPUS.

>  
>      XFREE(d->arch.pv.cpuidmasks);
>  
>      FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
> +    FREE_XENHEAP_PAGE(d->arch.pv.root_pt_l1tab);
>  }
>  
>  void noreturn cf_check continue_pv_domain(void);
> @@ -371,6 +396,12 @@ int pv_domain_initialise(struct domain *d)
>          goto fail;
>      clear_page(d->arch.pv.gdt_ldt_l1tab);
>  
> +    d->arch.pv.root_pt_l1tab =
> +        alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
> +    if ( !d->arch.pv.root_pt_l1tab )
> +        goto fail;
> +    clear_page(d->arch.pv.root_pt_l1tab);
> +
>      if ( levelling_caps & ~LCAP_faulting &&
>           (d->arch.pv.cpuidmasks = xmemdup(&cpuidmask_defaults)) == NULL )
>          goto fail;
> @@ -381,6 +412,11 @@ int pv_domain_initialise(struct domain *d)
>      if ( rc )
>          goto fail;
>  
> +    rc = create_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START,
> +                                  PV_ROOT_PT_MAPPING_ENTRIES, NULL, NULL);

Why not use d->max_vcpus here, instead of forcing up to MAX_VIRT_CPUS?

By the time pv_domain_initialise() is called max_vcpus should already
be initialized.  AFAICT it doesn't make a difference, because for the
call here only the L3 table is created, as last two parameters are
NULL, but still is more accurate to use max_vcpus.

> +    if ( rc )
> +        goto fail;
> +
>      d->arch.ctxt_switch = &pv_csw;
>  
>      d->arch.pv.xpti = is_hardware_domain(d) ? opt_xpti_hwdom : opt_xpti_domu;
> diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
> index 630bdc3945..c1ae5013af 100644
> --- a/xen/arch/x86/x86_64/asm-offsets.c
> +++ b/xen/arch/x86/x86_64/asm-offsets.c
> @@ -80,6 +80,7 @@ void __dummy__(void)
>  
>  #undef OFFSET_EF
>  
> +    OFFSET(VCPU_id, struct vcpu, vcpu_id);
>      OFFSET(VCPU_processor, struct vcpu, processor);
>      OFFSET(VCPU_domain, struct vcpu, domain);
>      OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
> diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
> index df015589ce..c1377da7a5 100644
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -162,7 +162,15 @@ FUNC_LOCAL(restore_all_guest)
>          and   %rsi, %rdi
>          and   %r9, %rsi
>          add   %rcx, %rdi
> +
> +        /*
> +         * The address in the vCPU cr3 is always mapped in the per-domain
> +         * pv_root_pt virt area.
> +         */
> +        imul  $PAGE_SIZE, VCPU_id(%rbx), %esi

Aren't some of the previous operations against %rsi now useless since
it gets unconditionally overwritten here?

and   %r9, %rsi
[...]
add   %rcx, %rsi

Thanks, Roger.
Jan Beulich May 14, 2024, 8:03 a.m. UTC | #2
On 13.05.2024 17:27, Roger Pau Monné wrote:
> On Mon, May 13, 2024 at 11:10:59AM +0000, Elias El Yandouzi wrote:
>> @@ -317,6 +317,14 @@ extern unsigned long xen_phys_start;
>>  #define ARG_XLAT_START(v)        \
>>      (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
>>  
>> +/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
>> +#define PV_ROOT_PT_MAPPING_VIRT_START   PERDOMAIN_VIRT_SLOT(3)
>> +#define PV_ROOT_PT_MAPPING_ENTRIES      MAX_VIRT_CPUS
>> +
>> +/* The address of a particular VCPU's PV_ROOT_PT */
>> +#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
>> +    (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
> 
> I know we are not there yet, but I wonder if we need to start having
> some non-shared per-cpu mapping area in the page-tables.  Right now
> this is shared between all the vCPUs, as it's per-domain space
> (instead of per-vCPU).

In turn requiring per-vCPU page tables, posing a problem when a guest
uses the same page tables for multiple vCPU-s.

Jan
Alejandro Vallejo May 14, 2024, 3:46 p.m. UTC | #3
On 14/05/2024 09:03, Jan Beulich wrote:
> On 13.05.2024 17:27, Roger Pau Monné wrote:
>> On Mon, May 13, 2024 at 11:10:59AM +0000, Elias El Yandouzi wrote:
>>> @@ -317,6 +317,14 @@ extern unsigned long xen_phys_start;
>>>  #define ARG_XLAT_START(v)        \
>>>      (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
>>>  
>>> +/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
>>> +#define PV_ROOT_PT_MAPPING_VIRT_START   PERDOMAIN_VIRT_SLOT(3)
>>> +#define PV_ROOT_PT_MAPPING_ENTRIES      MAX_VIRT_CPUS
>>> +
>>> +/* The address of a particular VCPU's PV_ROOT_PT */
>>> +#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
>>> +    (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
>>
>> I know we are not there yet, but I wonder if we need to start having
>> some non-shared per-cpu mapping area in the page-tables.  Right now
>> this is shared between all the vCPUs, as it's per-domain space
>> (instead of per-vCPU).
> 
> In turn requiring per-vCPU page tables, posing a problem when a guest
> uses the same page tables for multiple vCPU-s.
> 
> Jan
> 

True. Having separate page tables per CPU is an unavoidable end goal for
a hypervisor claiming to hold no secrets, however. Otherwise any CPU can
still speculatively read the stacks of other CPUs and take well-timed
glances over mappings set transiently by any other CPU.

Cheers,
Alejandro
Elias El Yandouzi May 14, 2024, 5:15 p.m. UTC | #4
Hi Roger,

On 13/05/2024 16:27, Roger Pau Monné wrote:
>> diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
>> index 2a445bb17b..1b025986f7 100644
>> --- a/xen/arch/x86/pv/domain.c
>> +++ b/xen/arch/x86/pv/domain.c
>> @@ -288,6 +288,21 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
>>                                 1U << GDT_LDT_VCPU_SHIFT);
>>   }
>>   
>> +static int pv_create_root_pt_l1tab(struct vcpu *v)
>> +{
>> +    return create_perdomain_mapping(v->domain,
>> +                                    PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
>> +                                    1, v->domain->arch.pv.root_pt_l1tab,
>> +                                    NULL);
>> +}
>> +
>> +static void pv_destroy_root_pt_l1tab(struct vcpu *v)
> 
> The two 'v' parameters could be const here.

I could constify the parameters but the functions wouldn't be consistent 
with the two above for gdt/ldt.

>> @@ -381,6 +412,11 @@ int pv_domain_initialise(struct domain *d)
>>       if ( rc )
>>           goto fail;
>>   
>> +    rc = create_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START,
>> +                                  PV_ROOT_PT_MAPPING_ENTRIES, NULL, NULL);
> 
> Why not use d->max_vcpus here, instead of forcing up to MAX_VIRT_CPUS?
> 
> By the time pv_domain_initialise() is called max_vcpus should already
> be initialized.  AFAICT it doesn't make a difference, because for the
> call here only the L3 table is created, as last two parameters are
> NULL, but still is more accurate to use max_vcpus.


There is no particular reason. I think we can safely use d->max_vcpus.
>> diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
>> index df015589ce..c1377da7a5 100644
>> --- a/xen/arch/x86/x86_64/entry.S
>> +++ b/xen/arch/x86/x86_64/entry.S
>> @@ -162,7 +162,15 @@ FUNC_LOCAL(restore_all_guest)
>>           and   %rsi, %rdi
>>           and   %r9, %rsi
>>           add   %rcx, %rdi
>> +
>> +        /*
>> +         * The address in the vCPU cr3 is always mapped in the per-domain
>> +         * pv_root_pt virt area.
>> +         */
>> +        imul  $PAGE_SIZE, VCPU_id(%rbx), %esi
> 
> Aren't some of the previous operations against %rsi now useless since
> it gets unconditionally overwritten here?

I think I can just get rid off of:

     and   %r9, %rsi

> and   %r9, %rsi
> [...]
> add   %rcx, %rsi

The second operation you suggested is actually used to retrieve the VA 
of the PV_ROOT_PT.

Cheers,
Elias
Roger Pau Monné May 15, 2024, 9:05 a.m. UTC | #5
On Tue, May 14, 2024 at 06:15:57PM +0100, Elias El Yandouzi wrote:
> Hi Roger,
> 
> On 13/05/2024 16:27, Roger Pau Monné wrote:
> > > diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
> > > index 2a445bb17b..1b025986f7 100644
> > > --- a/xen/arch/x86/pv/domain.c
> > > +++ b/xen/arch/x86/pv/domain.c
> > > @@ -288,6 +288,21 @@ static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
> > >                                 1U << GDT_LDT_VCPU_SHIFT);
> > >   }
> > > +static int pv_create_root_pt_l1tab(struct vcpu *v)
> > > +{
> > > +    return create_perdomain_mapping(v->domain,
> > > +                                    PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
> > > +                                    1, v->domain->arch.pv.root_pt_l1tab,
> > > +                                    NULL);
> > > +}
> > > +
> > > +static void pv_destroy_root_pt_l1tab(struct vcpu *v)
> > 
> > The two 'v' parameters could be const here.
> 
> I could constify the parameters but the functions wouldn't be consistent
> with the two above for gdt/ldt.

The fact they are not const for the other helpers would also need
fixing at some point IMO, it's best if those are already using the
correct type.

> > > diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
> > > index df015589ce..c1377da7a5 100644
> > > --- a/xen/arch/x86/x86_64/entry.S
> > > +++ b/xen/arch/x86/x86_64/entry.S
> > > @@ -162,7 +162,15 @@ FUNC_LOCAL(restore_all_guest)
> > >           and   %rsi, %rdi
> > >           and   %r9, %rsi
> > >           add   %rcx, %rdi
> > > +
> > > +        /*
> > > +         * The address in the vCPU cr3 is always mapped in the per-domain
> > > +         * pv_root_pt virt area.
> > > +         */
> > > +        imul  $PAGE_SIZE, VCPU_id(%rbx), %esi
> > 
> > Aren't some of the previous operations against %rsi now useless since
> > it gets unconditionally overwritten here?
> 
> I think I can just get rid off of:
> 
>     and   %r9, %rsi
> 
> > and   %r9, %rsi
> > [...]
> > add   %rcx, %rsi
> 
> The second operation you suggested is actually used to retrieve the VA of
> the PV_ROOT_PT.

Oh, yes, sorry, got confused when looking at the source file together
with the diff, it's only the `and` that can be removed.

Thanks, Roger.
diff mbox series

Patch

diff --git a/xen/arch/x86/include/asm/config.h b/xen/arch/x86/include/asm/config.h
index ab7288cb36..5d710384df 100644
--- a/xen/arch/x86/include/asm/config.h
+++ b/xen/arch/x86/include/asm/config.h
@@ -203,7 +203,7 @@  extern unsigned char boot_edid_info[128];
 /* Slot 260: per-domain mappings (including map cache). */
 #define PERDOMAIN_VIRT_START    (PML4_ADDR(260))
 #define PERDOMAIN_SLOT_MBYTES   (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
-#define PERDOMAIN_SLOTS         3
+#define PERDOMAIN_SLOTS         4
 #define PERDOMAIN_VIRT_SLOT(s)  (PERDOMAIN_VIRT_START + (s) * \
                                  (PERDOMAIN_SLOT_MBYTES << 20))
 /* Slot 4: mirror of per-domain mappings (for compat xlat area accesses). */
@@ -317,6 +317,14 @@  extern unsigned long xen_phys_start;
 #define ARG_XLAT_START(v)        \
     (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT))
 
+/* pv_root_pt mapping area. The fourth per-domain-mapping sub-area */
+#define PV_ROOT_PT_MAPPING_VIRT_START   PERDOMAIN_VIRT_SLOT(3)
+#define PV_ROOT_PT_MAPPING_ENTRIES      MAX_VIRT_CPUS
+
+/* The address of a particular VCPU's PV_ROOT_PT */
+#define PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v) \
+    (PV_ROOT_PT_MAPPING_VIRT_START + ((v)->vcpu_id * PAGE_SIZE))
+
 #define ELFSIZE 64
 
 #define ARCH_CRASH_SAVE_VMCOREINFO
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index f5daeb182b..8a97530607 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -272,6 +272,7 @@  struct time_scale {
 struct pv_domain
 {
     l1_pgentry_t **gdt_ldt_l1tab;
+    l1_pgentry_t **root_pt_l1tab;
 
     atomic_t nr_l4_pages;
 
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index d968bbbc73..efdf20f775 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -505,6 +505,13 @@  void share_xen_page_with_guest(struct page_info *page, struct domain *d,
     nrspin_unlock(&d->page_alloc_lock);
 }
 
+#define pv_root_pt_idx(v) \
+    ((v)->vcpu_id >> PAGETABLE_ORDER)
+
+#define pv_root_pt_pte(v) \
+    ((v)->domain->arch.pv.root_pt_l1tab[pv_root_pt_idx(v)] + \
+     ((v)->vcpu_id & (L1_PAGETABLE_ENTRIES - 1)))
+
 void make_cr3(struct vcpu *v, mfn_t mfn)
 {
     struct domain *d = v->domain;
@@ -524,6 +531,13 @@  void write_ptbase(struct vcpu *v)
 
     if ( is_pv_vcpu(v) && v->domain->arch.pv.xpti )
     {
+        mfn_t guest_root_pt = _mfn(MASK_EXTR(v->arch.cr3, PAGE_MASK));
+        l1_pgentry_t *pte = pv_root_pt_pte(v);
+
+        ASSERT(v == current);
+
+        l1e_write(pte, l1e_from_mfn(guest_root_pt, __PAGE_HYPERVISOR_RO));
+
         cpu_info->root_pgt_changed = true;
         cpu_info->pv_cr3 = __pa(this_cpu(root_pgt));
         if ( new_cr4 & X86_CR4_PCIDE )
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 2a445bb17b..1b025986f7 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -288,6 +288,21 @@  static void pv_destroy_gdt_ldt_l1tab(struct vcpu *v)
                               1U << GDT_LDT_VCPU_SHIFT);
 }
 
+static int pv_create_root_pt_l1tab(struct vcpu *v)
+{
+    return create_perdomain_mapping(v->domain,
+                                    PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v),
+                                    1, v->domain->arch.pv.root_pt_l1tab,
+                                    NULL);
+}
+
+static void pv_destroy_root_pt_l1tab(struct vcpu *v)
+
+{
+    destroy_perdomain_mapping(v->domain,
+                              PV_ROOT_PT_MAPPING_VCPU_VIRT_START(v), 1);
+}
+
 void pv_vcpu_destroy(struct vcpu *v)
 {
     if ( is_pv_32bit_vcpu(v) )
@@ -297,6 +312,7 @@  void pv_vcpu_destroy(struct vcpu *v)
     }
 
     pv_destroy_gdt_ldt_l1tab(v);
+    pv_destroy_root_pt_l1tab(v);
     XFREE(v->arch.pv.trap_ctxt);
 }
 
@@ -311,6 +327,13 @@  int pv_vcpu_initialise(struct vcpu *v)
     if ( rc )
         return rc;
 
+    if ( v->domain->arch.pv.xpti )
+    {
+        rc = pv_create_root_pt_l1tab(v);
+        if ( rc )
+            goto done;
+    }
+
     BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
                  PAGE_SIZE);
     v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
@@ -346,10 +369,12 @@  void pv_domain_destroy(struct domain *d)
 
     destroy_perdomain_mapping(d, GDT_LDT_VIRT_START,
                               GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
+    destroy_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START, PV_ROOT_PT_MAPPING_ENTRIES);
 
     XFREE(d->arch.pv.cpuidmasks);
 
     FREE_XENHEAP_PAGE(d->arch.pv.gdt_ldt_l1tab);
+    FREE_XENHEAP_PAGE(d->arch.pv.root_pt_l1tab);
 }
 
 void noreturn cf_check continue_pv_domain(void);
@@ -371,6 +396,12 @@  int pv_domain_initialise(struct domain *d)
         goto fail;
     clear_page(d->arch.pv.gdt_ldt_l1tab);
 
+    d->arch.pv.root_pt_l1tab =
+        alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
+    if ( !d->arch.pv.root_pt_l1tab )
+        goto fail;
+    clear_page(d->arch.pv.root_pt_l1tab);
+
     if ( levelling_caps & ~LCAP_faulting &&
          (d->arch.pv.cpuidmasks = xmemdup(&cpuidmask_defaults)) == NULL )
         goto fail;
@@ -381,6 +412,11 @@  int pv_domain_initialise(struct domain *d)
     if ( rc )
         goto fail;
 
+    rc = create_perdomain_mapping(d, PV_ROOT_PT_MAPPING_VIRT_START,
+                                  PV_ROOT_PT_MAPPING_ENTRIES, NULL, NULL);
+    if ( rc )
+        goto fail;
+
     d->arch.ctxt_switch = &pv_csw;
 
     d->arch.pv.xpti = is_hardware_domain(d) ? opt_xpti_hwdom : opt_xpti_domu;
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index 630bdc3945..c1ae5013af 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -80,6 +80,7 @@  void __dummy__(void)
 
 #undef OFFSET_EF
 
+    OFFSET(VCPU_id, struct vcpu, vcpu_id);
     OFFSET(VCPU_processor, struct vcpu, processor);
     OFFSET(VCPU_domain, struct vcpu, domain);
     OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index df015589ce..c1377da7a5 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -162,7 +162,15 @@  FUNC_LOCAL(restore_all_guest)
         and   %rsi, %rdi
         and   %r9, %rsi
         add   %rcx, %rdi
+
+        /*
+         * The address in the vCPU cr3 is always mapped in the per-domain
+         * pv_root_pt virt area.
+         */
+        imul  $PAGE_SIZE, VCPU_id(%rbx), %esi
+        movabs $PV_ROOT_PT_MAPPING_VIRT_START, %rcx
         add   %rcx, %rsi
+
         mov   $ROOT_PAGETABLE_FIRST_XEN_SLOT, %ecx
         mov   root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rsi), %r8
         mov   %r8, root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rdi)