diff mbox series

[v2,6/6] x86/boot: Do not use trampoline for no-real-mode boot paths

Message ID c0e531fc665c9ad7595d853e2ce631a13974c022.camel@infradead.org (mailing list archive)
State New, archived
Headers show
Series [v2,1/6] x86/boot: Remove gratuitous call back into low-memory code | expand

Commit Message

David Woodhouse Aug. 9, 2019, 3:02 p.m. UTC
From: David Woodhouse <dwmw@amazon.co.uk>

Where booted from EFI or with no-real-mode, there is no need to stomp
on low memory with the 16-boot code. Instead, just go straight to
trampoline_protmode_entry() at its physical location within the Xen
image.

For now, the boot code (including the EFI loader path) still determines
what the trampoline_phys address should be. The trampoline is actually
relocated for that address and copied into low memory, from a
relocate_trampoline() call made from __start_xen().

For subsequent AP startup and wakeup, the 32-bit trampoline can't
trivially be used in-place as that region isn't mapped. So copy it
down to low memory too, having relocated it (again) to work from
there.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 xen/arch/x86/acpi/power.c      |  6 +--
 xen/arch/x86/boot/head.S       | 67 ++++++++++++++++------------------
 xen/arch/x86/boot/trampoline.S | 32 ++++++++++++----
 xen/arch/x86/cpu/common.c      |  2 +-
 xen/arch/x86/cpu/intel.c       |  2 +-
 xen/arch/x86/efi/efi-boot.h    | 31 ++--------------
 xen/arch/x86/setup.c           | 43 ++++++++++++++++++++--
 xen/arch/x86/smpboot.c         |  6 +--
 xen/arch/x86/tboot.c           |  6 +--
 xen/arch/x86/x86_64/mm.c       |  2 +-
 xen/include/asm-x86/acpi.h     |  2 +-
 xen/include/asm-x86/config.h   | 10 ++---
 12 files changed, 118 insertions(+), 91 deletions(-)

Comments

Jan Beulich Aug. 12, 2019, 10:55 a.m. UTC | #1
On 09.08.2019 17:02, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
> 
> Where booted from EFI or with no-real-mode, there is no need to stomp
> on low memory with the 16-boot code. Instead, just go straight to
> trampoline_protmode_entry() at its physical location within the Xen
> image.
> 
> For now, the boot code (including the EFI loader path) still determines
> what the trampoline_phys address should be. The trampoline is actually
> relocated for that address and copied into low memory, from a
> relocate_trampoline() call made from __start_xen().

I assume this talks about the real mode part of the trampoline, as
opposed to the next paragraph? Would be nice if you made this
explicit.

> For subsequent AP startup and wakeup, the 32-bit trampoline can't
> trivially be used in-place as that region isn't mapped. So copy it
> down to low memory too, having relocated it (again) to work from
> there.

trampoline_protmode_entry gets entered with CR0.PG=0, i.e. at
that point there's not even the question yet of there being a
mapping. Subsequently idle_pg_table gets loaded into CR3. I wonder
if, rather than relocating the 32-bit part of the trampoline, it
wouldn't be better to install a 1:1 mapping into idle_pg_table.
Such a mapping would need to have the G bits clear in order to
not conflict with PV guest mappings of the same linear addresses.

> --- a/xen/arch/x86/acpi/power.c
> +++ b/xen/arch/x86/acpi/power.c
> @@ -152,9 +152,9 @@ static void acpi_sleep_prepare(u32 state)
>           return;
>   
>       if ( acpi_sinfo.vector_width == 32 )
> -        *(uint32_t *)wakeup_vector_va = bootsym_phys(wakeup_start);
> +        *(uint32_t *)wakeup_vector_va = trampsym_phys(wakeup_start);
>       else
> -        *(uint64_t *)wakeup_vector_va = bootsym_phys(wakeup_start);
> +        *(uint64_t *)wakeup_vector_va = trampsym_phys(wakeup_start);
>   }
>   
>   static void acpi_sleep_post(u32 state) {}
> @@ -388,7 +388,7 @@ static void tboot_sleep(u8 sleep_state)
>       g_tboot_shared->acpi_sinfo.wakeup_vector = acpi_sinfo.wakeup_vector;
>       g_tboot_shared->acpi_sinfo.vector_width = acpi_sinfo.vector_width;
>       g_tboot_shared->acpi_sinfo.kernel_s3_resume_vector =
> -                                              bootsym_phys(wakeup_start);
> +                                              trampsym_phys(wakeup_start);

Shouldn't changes like these have happened earlier, when you
introduce the (logical only at that point) distinction between
trampoline pieces?

> @@ -97,7 +100,7 @@ GLOBAL(trampoline_realmode_entry)
>          cld
>          cli
>          lidt    trampsym(idt_48)
> -        lgdt    trampsym(gdt_48)
> +        lgdtl   trampsym(gdt_48)

Stray / unrelated change (and if needed, then also for lidt)?

> @@ -236,11 +239,23 @@ gdt_48: .word   7*8-1
>   
>   /* The first page of trampoline is permanent, the rest boot-time only. */
>   /* Reuse the boot trampoline on the 1st trampoline page as stack for wakeup. */
> -        .equ    wakeup_stack, boot_trampoline_start + PAGE_SIZE
> +        .equ    wakeup_stack, perm_trampoline_start + PAGE_SIZE
>           .global wakeup_stack
>   
> +ENTRY(perm_trampoline_end)
> +
>   /* From here on early boot only. */
>   
> +ENTRY(boot_trampoline_start)
> +
> +        .word   0
> +boot16_idt:
> +        .word   0, 0, 0 # base = limit = 0
> +        .word   0
> +boot16_gdt:
> +        .word   7*8-1
> +        .long   tramp32sym_rel(trampoline_gdt,4)

Can we really not get away without a second copy of these?

> @@ -304,8 +319,8 @@ trampoline_boot_cpu_entry:
>           cli
>   
>           /* Reset GDT and IDT. Some BIOSes clobber GDTR. */
> -        lidt    bootsym(idt_48)
> -        lgdt    bootsym(gdt_48)
> +        lidt    bootsym(boot16_idt)
> +        lgdtl   bootsym(boot16_gdt)

As above - either both should gain a suffix, or neither of them.

> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -682,6 +682,42 @@ static unsigned int __init copy_bios_e820(struct e820entry *map, unsigned int li
>       return n;
>   }
>   
> +extern const s32 __trampoline_rel_start[], __trampoline_rel_stop[];
> +extern const s32 __trampoline32_rel_start[], __trampoline32_rel_stop[];
> +
> +static void __init relocate_trampoline(unsigned long phys)
> +{
> +    const s32 *trampoline_ptr;
> +    uint32_t tramp32_delta = 0;
> +
> +    /* Apply relocations to trampoline. */
> +    for ( trampoline_ptr = __trampoline_rel_start;
> +          trampoline_ptr < __trampoline_rel_stop;
> +          ++trampoline_ptr )
> +        *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) += phys;
> +
> +    tramp32_delta = phys;

Any reason this can't be the initializer of the variable, or the
zero initializer above can't be dropped?

> +    if (!efi_enabled(EFI_LOADER)) {

Style (missing blanks inside the parentheses, and brace to go on
its own line).

> --- a/xen/include/asm-x86/config.h
> +++ b/xen/include/asm-x86/config.h
> @@ -89,12 +89,12 @@
>  
>  #ifndef __ASSEMBLY__
>  extern unsigned long trampoline_phys;
> -#define bootsym_phys(sym)                                 \
> -    (((unsigned long)&(sym)-(unsigned long)&boot_trampoline_start)+trampoline_phys)
> -#define bootsym(sym)                                      \
> +#define trampsym_phys(sym)                                 \
> +    (((unsigned long)&(sym)-(unsigned long)&perm_trampoline_start)+trampoline_phys)
> +#define trampsym(sym)                                      \
>      (*RELOC_HIDE((typeof(&(sym)))__va(__pa(&(sym))),      \
> -                 trampoline_phys-__pa(boot_trampoline_start)))
> -extern char boot_trampoline_start[], boot_trampoline_end[];
> +                 trampoline_phys-__pa(perm_trampoline_start)))

As you're touching these, could you please also insert the missing
blanks around the binary + and - ?

Jan
David Woodhouse Aug. 19, 2019, 3:25 p.m. UTC | #2
On Mon, 2019-08-12 at 12:55 +0200, Jan Beulich wrote:
> On 09.08.2019 17:02, David Woodhouse wrote:
> > From: David Woodhouse <dwmw@amazon.co.uk>
> > 
> > Where booted from EFI or with no-real-mode, there is no need to stomp
> > on low memory with the 16-boot code. Instead, just go straight to
> > trampoline_protmode_entry() at its physical location within the Xen
> > image.
> > 
> > For now, the boot code (including the EFI loader path) still determines
> > what the trampoline_phys address should be. The trampoline is actually
> > relocated for that address and copied into low memory, from a
> > relocate_trampoline() call made from __start_xen().
> 
> I assume this talks about the real mode part of the trampoline, as
> opposed to the next paragraph? Would be nice if you made this
> explicit.

This is the permanent real-mode trampoline used for AP startup and
wakeup, not the real-mode boot code (which the boot code has to have
put there for itself it it wanted it).

I will try to make the commit message clearer; thanks for pointing it
out.

> > For subsequent AP startup and wakeup, the 32-bit trampoline can't
> > trivially be used in-place as that region isn't mapped. So copy it
> > down to low memory too, having relocated it (again) to work from
> > there.
> 
> trampoline_protmode_entry gets entered with CR0.PG=0, i.e. at
> that point there's not even the question yet of there being a
> mapping. Subsequently idle_pg_table gets loaded into CR3. I wonder
> if, rather than relocating the 32-bit part of the trampoline, it
> wouldn't be better to install a 1:1 mapping into idle_pg_table.
> Such a mapping would need to have the G bits clear in order to
> not conflict with PV guest mappings of the same linear addresses.

Yeah, I tried making that happen. It made me sad. This seemed to be
simpler and less fragile.

> > --- a/xen/arch/x86/acpi/power.c
> > +++ b/xen/arch/x86/acpi/power.c
> > @@ -152,9 +152,9 @@ static void acpi_sleep_prepare(u32 state)
> >           return;
> >   
> >       if ( acpi_sinfo.vector_width == 32 )
> > -        *(uint32_t *)wakeup_vector_va = bootsym_phys(wakeup_start);
> > +        *(uint32_t *)wakeup_vector_va = trampsym_phys(wakeup_start);
> >       else
> > -        *(uint64_t *)wakeup_vector_va = bootsym_phys(wakeup_start);
> > +        *(uint64_t *)wakeup_vector_va = trampsym_phys(wakeup_start);
> >   }
> >   
> >   static void acpi_sleep_post(u32 state) {}
> > @@ -388,7 +388,7 @@ static void tboot_sleep(u8 sleep_state)
> >       g_tboot_shared->acpi_sinfo.wakeup_vector = acpi_sinfo.wakeup_vector;
> >       g_tboot_shared->acpi_sinfo.vector_width = acpi_sinfo.vector_width;
> >       g_tboot_shared->acpi_sinfo.kernel_s3_resume_vector =
> > -                                              bootsym_phys(wakeup_start);
> > +                                              trampsym_phys(wakeup_start);
> 
> Shouldn't changes like these have happened earlier, when you
> introduce the (logical only at that point) distinction between
> trampoline pieces?

That was in assembler code. This is C, which never had to be that
involved with the distinction. But now that all the dust has settled,
I'm making it consistent, using 'trampsym' for stuff in the permanent
trampoline just like the asm code does.


> > @@ -97,7 +100,7 @@ GLOBAL(trampoline_realmode_entry)
> >          cld
> >          cli
> >          lidt    trampsym(idt_48)
> > -        lgdt    trampsym(gdt_48)
> > +        lgdtl   trampsym(gdt_48)
> 
> Stray / unrelated change (and if needed, then also for lidt)?

The difference between 16bit l.dt and 32-bit l.dtl is that the former
only loads 24 bits of the actual table address (trampoline_gdt in this
case).

Thus, when trampoline_gdt is being used in-place, as it is during early
boot, and *if* the Xen image is loaded higher than 16MiB, lgdt doesn't
work. That's half a day of my life I want back.

It doesn't matter for lidt because we're just loading an empty limit
and pointer there, and we don't care about bits 24-31 of a zero value.

> > @@ -236,11 +239,23 @@ gdt_48: .word   7*8-1
> >   
> >   /* The first page of trampoline is permanent, the rest boot-time only. */
> >   /* Reuse the boot trampoline on the 1st trampoline page as stack for wakeup. */
> > -        .equ    wakeup_stack, boot_trampoline_start + PAGE_SIZE
> > +        .equ    wakeup_stack, perm_trampoline_start + PAGE_SIZE
> >           .global wakeup_stack
> >   
> > +ENTRY(perm_trampoline_end)
> > +
> >   /* From here on early boot only. */
> >   
> > +ENTRY(boot_trampoline_start)
> > +
> > +        .word   0
> > +boot16_idt:
> > +        .word   0, 0, 0 # base = limit = 0
> > +        .word   0
> > +boot16_gdt:
> > +        .word   7*8-1
> > +        .long   tramp32sym_rel(trampoline_gdt,4)
> 
> Can we really not get away without a second copy of these?

Probably, but my judgement was that the complexity and the pain of
doing so would exceed the benefit. I'll take another look at doing so.

> > @@ -304,8 +319,8 @@ trampoline_boot_cpu_entry:
> >           cli
> >   
> >           /* Reset GDT and IDT. Some BIOSes clobber GDTR. */
> > -        lidt    bootsym(idt_48)
> > -        lgdt    bootsym(gdt_48)
> > +        lidt    bootsym(boot16_idt)
> > +        lgdtl   bootsym(boot16_gdt)
> 
> As above - either both should gain a suffix, or neither of them.
> 
> > --- a/xen/arch/x86/setup.c
> > +++ b/xen/arch/x86/setup.c
> > @@ -682,6 +682,42 @@ static unsigned int __init copy_bios_e820(struct e820entry *map, unsigned int li
> >       return n;
> >   }
> >   
> > +extern const s32 __trampoline_rel_start[], __trampoline_rel_stop[];
> > +extern const s32 __trampoline32_rel_start[], __trampoline32_rel_stop[];
> > +
> > +static void __init relocate_trampoline(unsigned long phys)
> > +{
> > +    const s32 *trampoline_ptr;
> > +    uint32_t tramp32_delta = 0;
> > +
> > +    /* Apply relocations to trampoline. */
> > +    for ( trampoline_ptr = __trampoline_rel_start;
> > +          trampoline_ptr < __trampoline_rel_stop;
> > +          ++trampoline_ptr )
> > +        *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) += phys;
> > +
> > +    tramp32_delta = phys;
> 
> Any reason this can't be the initializer of the variable, or the
> zero initializer above can't be dropped?

I can't think of one. I think I quite like the initial setting of
tramp32_delta=phys to live *right* above the subsequent if(something)
tramp32_delta-=something, to make it very clear what that calculation
is.

So maybe I'll just drop the pointless =0 initialiser.

> > +    if (!efi_enabled(EFI_LOADER)) {
> 
> Style (missing blanks inside the parentheses, and brace to go on
> its own line).

Ack. You can take the Linux hacker out of the Linux kernel but...

> > --- a/xen/include/asm-x86/config.h
> > +++ b/xen/include/asm-x86/config.h
> > @@ -89,12 +89,12 @@
> >  
> >  #ifndef __ASSEMBLY__
> >  extern unsigned long trampoline_phys;
> > -#define bootsym_phys(sym)                                 \
> > -    (((unsigned long)&(sym)-(unsigned long)&boot_trampoline_start)+trampoline_phys)
> > -#define bootsym(sym)                                      \
> > +#define trampsym_phys(sym)                                 \
> > +    (((unsigned long)&(sym)-(unsigned long)&perm_trampoline_start)+trampoline_phys)
> > +#define trampsym(sym)                                      \
> >      (*RELOC_HIDE((typeof(&(sym)))__va(__pa(&(sym))),      \
> > -                 trampoline_phys-__pa(boot_trampoline_start)))
> > -extern char boot_trampoline_start[], boot_trampoline_end[];
> > +                 trampoline_phys-__pa(perm_trampoline_start)))
> 
> As you're touching these, could you please also insert the missing
> blanks around the binary + and - ?

Will do. Thanks.
Jan Beulich Aug. 27, 2019, 9:07 a.m. UTC | #3
On 19.08.2019 17:25, David Woodhouse wrote:
> On Mon, 2019-08-12 at 12:55 +0200, Jan Beulich wrote:
>> On 09.08.2019 17:02, David Woodhouse wrote:
>>> @@ -97,7 +100,7 @@ GLOBAL(trampoline_realmode_entry)
>>>           cld
>>>           cli
>>>           lidt    trampsym(idt_48)
>>> -        lgdt    trampsym(gdt_48)
>>> +        lgdtl   trampsym(gdt_48)
>>
>> Stray / unrelated change (and if needed, then also for lidt)?
> 
> The difference between 16bit l.dt and 32-bit l.dtl is that the former
> only loads 24 bits of the actual table address (trampoline_gdt in this
> case).
> 
> Thus, when trampoline_gdt is being used in-place, as it is during early
> boot, and *if* the Xen image is loaded higher than 16MiB, lgdt doesn't
> work. That's half a day of my life I want back.

But isn't this an issue even independent of your series? I.e. doesn't
this want to be fixed in a separate (to be backported) patch?

Jan
David Woodhouse Aug. 27, 2019, 9:12 a.m. UTC | #4
> On 19.08.2019 17:25, David Woodhouse wrote:
>> On Mon, 2019-08-12 at 12:55 +0200, Jan Beulich wrote:
>>> On 09.08.2019 17:02, David Woodhouse wrote:
>>>> @@ -97,7 +100,7 @@ GLOBAL(trampoline_realmode_entry)
>>>>           cld
>>>>           cli
>>>>           lidt    trampsym(idt_48)
>>>> -        lgdt    trampsym(gdt_48)
>>>> +        lgdtl   trampsym(gdt_48)
>>>
>>> Stray / unrelated change (and if needed, then also for lidt)?
>>
>> The difference between 16bit l.dt and 32-bit l.dtl is that the former
>> only loads 24 bits of the actual table address (trampoline_gdt in this
>> case).
>>
>> Thus, when trampoline_gdt is being used in-place, as it is during early
>> boot, and *if* the Xen image is loaded higher than 16MiB, lgdt doesn't
>> work. That's half a day of my life I want back.
>
> But isn't this an issue even independent of your series? I.e. doesn't
> this want to be fixed in a separate (to be backported) patch?

Before my series it isn't used in place in the Xen image; it's also
(mostly gratuitously) copied to low memory.
diff mbox series

Patch

diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c
index aecc754fdb..d2a355429e 100644
--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -152,9 +152,9 @@  static void acpi_sleep_prepare(u32 state)
         return;
 
     if ( acpi_sinfo.vector_width == 32 )
-        *(uint32_t *)wakeup_vector_va = bootsym_phys(wakeup_start);
+        *(uint32_t *)wakeup_vector_va = trampsym_phys(wakeup_start);
     else
-        *(uint64_t *)wakeup_vector_va = bootsym_phys(wakeup_start);
+        *(uint64_t *)wakeup_vector_va = trampsym_phys(wakeup_start);
 }
 
 static void acpi_sleep_post(u32 state) {}
@@ -388,7 +388,7 @@  static void tboot_sleep(u8 sleep_state)
     g_tboot_shared->acpi_sinfo.wakeup_vector = acpi_sinfo.wakeup_vector;
     g_tboot_shared->acpi_sinfo.vector_width = acpi_sinfo.vector_width;
     g_tboot_shared->acpi_sinfo.kernel_s3_resume_vector =
-                                              bootsym_phys(wakeup_start);
+                                              trampsym_phys(wakeup_start);
 
     switch ( sleep_state )
     {
diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index 104eb0eb3c..2268ec99ed 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -697,16 +697,23 @@  trampoline_setup:
         lea     __PAGE_HYPERVISOR+sym_esi(l1_identmap),%edi
         mov     %edi,sym_fs(l2_bootmap)
 
-        /* Apply relocations to bootstrap trampoline. */
-        mov     sym_fs(trampoline_phys),%edx
-        mov     $sym_offs(__trampoline_rel_start),%edi
-1:
-        mov     %fs:(%edi),%eax
-        add     %edx,%fs:(%edi,%eax)
-        add     $4,%edi
-        cmp     $sym_offs(__trampoline_rel_stop),%edi
-        jb      1b
+        /* Do not parse command line on EFI platform here. */
+        cmpb    $0,sym_fs(efi_platform)
+        jnz     1f
 
+        /* Bail if there is no command line to parse. */
+        mov     sym_fs(multiboot_ptr),%ebx
+        testl   $MBI_CMDLINE,MB_flags(%ebx)
+        jz      1f
+
+        lea     sym_esi(early_boot_opts),%eax
+        push    %eax
+        pushl   MB_cmdline(%ebx)
+        call    cmdline_parse_early
+
+1:
+        /* Apply relocations to 32-bit trampoline for execution in place. */
+        lea     sym_esi(perm_trampoline_start),%edx
         mov     $sym_offs(__trampoline32_rel_start),%edi
 1:
         mov     %fs:(%edi),%eax
@@ -715,6 +722,21 @@  trampoline_setup:
         cmp     $sym_offs(__trampoline32_rel_stop),%edi
         jb      1b
 
+        cmp     $0,sym_esi(skip_realmode)
+        jz      .Ldo_realmode
+
+        /* Go directly to trampoline_protmode_entry at its physical address */
+        lea     trampoline_protmode_entry-__XEN_VIRT_START(%esi),%eax
+        pushl   $BOOT_CS32
+        push    %eax
+
+        /* EBX == 0 indicates we are the BP (Boot Processor). */
+        xor     %ebx,%ebx
+        retl
+
+.Ldo_realmode:
+        /* Apply relocations to 16-bit boot code. */
+        mov     sym_fs(trampoline_phys),%edx
         mov     $sym_offs(__bootsym_rel_start),%edi
 1:
         mov     %fs:(%edi),%eax
@@ -744,35 +766,12 @@  trampoline_setup:
         cmp     $sym_offs(__bootdatasym_rel_stop),%edi
         jb      1b
 
-        /* Do not parse command line on EFI platform here. */
-        cmpb    $0,sym_fs(efi_platform)
-        jnz     1f
-
-        /* Bail if there is no command line to parse. */
-        mov     sym_fs(multiboot_ptr),%ebx
-        testl   $MBI_CMDLINE,MB_flags(%ebx)
-        jz      1f
-
-        lea     sym_esi(early_boot_opts),%eax
-        push    %eax
-        pushl   MB_cmdline(%ebx)
-        call    cmdline_parse_early
-
-1:
         /* Switch to low-memory stack which lives at the end of trampoline region. */
         mov     sym_fs(trampoline_phys),%edi
         lea     TRAMPOLINE_SPACE+TRAMPOLINE_STACK_SPACE(%edi),%esp
-        cmpb    $0, sym_fs(skip_realmode)
-        jz      1f
-        /* If no-real-mode, jump straight to trampoline_protmode_entry */
-        lea     trampoline_protmode_entry-boot_trampoline_start(%edi),%eax
-        /* EBX == 0 indicates we are the BP (Boot Processor). */
-        xor     %ebx,%ebx
-        jmp     2f
-1:
+
         /* Go via 16-bit code in trampoline_boot_cpu_entry */
         lea     trampoline_boot_cpu_entry-boot_trampoline_start(%edi),%eax
-2:
         pushl   $BOOT_CS32
         push    %eax
 
@@ -795,8 +794,6 @@  cmdline_parse_early:
 reloc:
 #include "reloc.S"
 
-ENTRY(boot_trampoline_start)
 #include "trampoline.S"
-ENTRY(boot_trampoline_end)
 
 #include "x86_64.S"
diff --git a/xen/arch/x86/boot/trampoline.S b/xen/arch/x86/boot/trampoline.S
index b5517a44bb..7b9ebb6172 100644
--- a/xen/arch/x86/boot/trampoline.S
+++ b/xen/arch/x86/boot/trampoline.S
@@ -60,7 +60,7 @@  GLOBAL(bootdata_start)
         .popsection
 
 #undef trampsym
-#define trampsym(s) ((s)-boot_trampoline_start)
+#define trampsym(s) ((s)-perm_trampoline_start)
 
 #define trampsym_rel(sym, off, opnd...)    \
         trampsym(sym),##opnd;              \
@@ -70,7 +70,7 @@  GLOBAL(bootdata_start)
         .popsection
 
 #undef tramp32sym
-#define tramp32sym(s) ((s)-boot_trampoline_start)
+#define tramp32sym(s) ((s)-perm_trampoline_start)
 
 #define tramp32sym_rel(sym, off, opnd...)  \
         tramp32sym(sym),##opnd;            \
@@ -83,6 +83,8 @@  GLOBAL(bootdata_start)
 
         .code16
 
+ENTRY(perm_trampoline_start)
+
 /*
  * do_boot_cpu() programs the Startup-IPI to point here.  Due to the SIPI
  * format, the relocated entrypoint must be 4k aligned.
@@ -90,6 +92,7 @@  GLOBAL(bootdata_start)
  * It is entered in Real Mode, with %cs = trampoline_realmode_entry >> 4 and
  * %ip = 0.
  */
+
 GLOBAL(trampoline_realmode_entry)
         mov     %cs,%ax
         mov     %ax,%ds
@@ -97,7 +100,7 @@  GLOBAL(trampoline_realmode_entry)
         cld
         cli
         lidt    trampsym(idt_48)
-        lgdt    trampsym(gdt_48)
+        lgdtl   trampsym(gdt_48)
         mov     $1,%bl                    # EBX != 0 indicates we are an AP
         xor     %ax, %ax
         inc     %ax
@@ -236,11 +239,23 @@  gdt_48: .word   7*8-1
 
 /* The first page of trampoline is permanent, the rest boot-time only. */
 /* Reuse the boot trampoline on the 1st trampoline page as stack for wakeup. */
-        .equ    wakeup_stack, boot_trampoline_start + PAGE_SIZE
+        .equ    wakeup_stack, perm_trampoline_start + PAGE_SIZE
         .global wakeup_stack
 
+ENTRY(perm_trampoline_end)
+
 /* From here on early boot only. */
 
+ENTRY(boot_trampoline_start)
+
+        .word   0
+boot16_idt:
+        .word   0, 0, 0 # base = limit = 0
+        .word   0
+boot16_gdt:
+        .word   7*8-1
+        .long   tramp32sym_rel(trampoline_gdt,4)
+
         .code32
 trampoline_boot_cpu_entry:
         /* Load pseudo-real-mode segments. */
@@ -304,8 +319,8 @@  trampoline_boot_cpu_entry:
         cli
 
         /* Reset GDT and IDT. Some BIOSes clobber GDTR. */
-        lidt    bootsym(idt_48)
-        lgdt    bootsym(gdt_48)
+        lidt    bootsym(boot16_idt)
+        lgdtl   bootsym(boot16_gdt)
 
         /* Enter protected mode, and flush insn queue. */
         xor     %ax,%ax
@@ -343,7 +358,8 @@  trampoline_boot_cpu_entry:
         xor     %ebx,%ebx
 
         /* Jump to the common bootstrap entry point. */
-        jmp     trampoline_protmode_entry
+        mov     $tramp32sym_rel(trampoline_protmode_entry,4,%eax)
+        jmp     *%eax
 
 #include "video.h"
 
@@ -379,6 +395,8 @@  rm_idt: .word   256*4-1, 0, 0
 #include "video.S"
 #endif
 
+ENTRY(boot_trampoline_end)
+
         .pushsection .data.boot16, "aw", @progbits
 GLOBAL(bootdata_end)
         .popsection
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 7478e21177..d202216739 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -389,7 +389,7 @@  static void generic_identify(struct cpuinfo_x86 *c)
 		      &c->x86_capability[cpufeat_word(X86_FEATURE_LAHF_LM)],
 		      &c->x86_capability[cpufeat_word(X86_FEATURE_SYSCALL)]);
 	if (c == &boot_cpu_data)
-		bootsym(cpuid_ext_features) =
+		trampsym(cpuid_ext_features) =
 			c->x86_capability[cpufeat_word(X86_FEATURE_NX)];
 
 	if (c->extended_cpuid_level >= 0x80000004)
diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
index 5356a6ae10..1970eb1848 100644
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -269,7 +269,7 @@  static void early_init_intel(struct cpuinfo_x86 *c)
 				 MSR_IA32_MISC_ENABLE_XD_DISABLE);
 	if (disable) {
 		wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
-		bootsym(trampoline_misc_enable_off) |= disable;
+		trampsym(trampoline_misc_enable_off) |= disable;
 	}
 
 	if (disable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID)
diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h
index fc2ea554b5..e7418894a4 100644
--- a/xen/arch/x86/efi/efi-boot.h
+++ b/xen/arch/x86/efi/efi-boot.h
@@ -98,29 +98,6 @@  static void __init efi_arch_relocate_image(unsigned long delta)
     }
 }
 
-extern const s32 __trampoline_rel_start[], __trampoline_rel_stop[];
-extern const s32 __trampoline32_rel_start[], __trampoline32_rel_stop[];
-
-static void __init relocate_trampoline(unsigned long phys)
-{
-    const s32 *trampoline_ptr;
-
-    trampoline_phys = phys;
-
-    if ( !efi_enabled(EFI_LOADER) )
-        return;
-
-    /* Apply relocations to trampoline. */
-    for ( trampoline_ptr = __trampoline_rel_start;
-          trampoline_ptr < __trampoline_rel_stop;
-          ++trampoline_ptr )
-        *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) += phys;
-    for ( trampoline_ptr = __trampoline32_rel_start;
-          trampoline_ptr < __trampoline32_rel_stop;
-          ++trampoline_ptr )
-        *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) += phys;
-}
-
 static void __init place_string(u32 *addr, const char *s)
 {
     char *alloc = NULL;
@@ -223,7 +200,7 @@  static void __init efi_arch_pre_exit_boot(void)
     {
         if ( !cfg.addr )
             blexit(L"No memory for trampoline");
-        relocate_trampoline(cfg.addr);
+        trampoline_phys = cfg.addr;
     }
 }
 
@@ -232,7 +209,6 @@  static void __init noreturn efi_arch_post_exit_boot(void)
     u64 cr4 = XEN_MINIMAL_CR4 & ~X86_CR4_PGE, efer;
 
     efi_arch_relocate_image(__XEN_VIRT_START - xen_phys_start);
-    memcpy((void *)trampoline_phys, boot_trampoline_start, cfg.size);
 
     /* Set system registers and transfer control. */
     asm volatile("pushq $0\n\tpopfq");
@@ -566,14 +542,14 @@  static void __init efi_arch_memory_setup(void)
     cfg.addr = 0x100000;
 
     if ( efi_enabled(EFI_LOADER) )
-        cfg.size = boot_trampoline_end - boot_trampoline_start;
+        cfg.size = perm_trampoline_end - perm_trampoline_start;
     else
         cfg.size = TRAMPOLINE_SPACE + TRAMPOLINE_STACK_SPACE;
 
     status = efi_bs->AllocatePages(AllocateMaxAddress, EfiLoaderData,
                                    PFN_UP(cfg.size), &cfg.addr);
     if ( status == EFI_SUCCESS )
-        relocate_trampoline(cfg.addr);
+        trampoline_phys = cfg.addr;
     else
     {
         cfg.addr = 0;
@@ -665,7 +641,6 @@  static void __init efi_arch_load_addr_check(EFI_LOADED_IMAGE *loaded_image)
         blexit(L"Xen must be loaded below 4Gb.");
     if ( xen_phys_start & ((1 << L2_PAGETABLE_SHIFT) - 1) )
         blexit(L"Xen must be loaded at a 2Mb boundary.");
-    trampoline_xen_phys_start = xen_phys_start;
 }
 
 static bool __init efi_arch_use_config_file(EFI_SYSTEM_TABLE *SystemTable)
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 101c9dd272..eb8bc7e33b 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -682,6 +682,42 @@  static unsigned int __init copy_bios_e820(struct e820entry *map, unsigned int li
     return n;
 }
 
+extern const s32 __trampoline_rel_start[], __trampoline_rel_stop[];
+extern const s32 __trampoline32_rel_start[], __trampoline32_rel_stop[];
+
+static void __init relocate_trampoline(unsigned long phys)
+{
+    const s32 *trampoline_ptr;
+    uint32_t tramp32_delta = 0;
+
+    /* Apply relocations to trampoline. */
+    for ( trampoline_ptr = __trampoline_rel_start;
+          trampoline_ptr < __trampoline_rel_stop;
+          ++trampoline_ptr )
+        *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) += phys;
+
+    tramp32_delta = phys;
+    if (!efi_enabled(EFI_LOADER)) {
+        /*
+         * The non-EFI boot code uses the 32-bit trampoline in place
+         * so will have relocated it to the physical address of
+         * perm_trampoline_start already. Undo that as it needs to
+         * run from low memory for AP startup, because the Xen
+         * physical address range won't be mapped.
+         */
+        tramp32_delta -= trampoline_xen_phys_start;
+        tramp32_delta -= (unsigned long)(perm_trampoline_start - __XEN_VIRT_START);
+    }
+    for ( trampoline_ptr = __trampoline32_rel_start;
+          trampoline_ptr < __trampoline32_rel_stop;
+          ++trampoline_ptr )
+        *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) += tramp32_delta;
+    trampoline_xen_phys_start = xen_phys_start;
+
+    memcpy(trampsym(perm_trampoline_start), perm_trampoline_start,
+           perm_trampoline_end - perm_trampoline_start);
+}
+
 void __init noreturn __start_xen(unsigned long mbi_p)
 {
     char *memmap_type = NULL;
@@ -1076,7 +1112,6 @@  void __init noreturn __start_xen(unsigned long mbi_p)
             /* Select relocation address. */
             e = end - reloc_size;
             xen_phys_start = e;
-            bootsym(trampoline_xen_phys_start) = e;
 
             /*
              * No PTEs pointing above this address are candidates for relocation.
@@ -1509,6 +1544,8 @@  void __init noreturn __start_xen(unsigned long mbi_p)
     else
         end_boot_allocator();
 
+    relocate_trampoline(trampoline_phys);
+
     system_state = SYS_STATE_boot;
     /*
      * No calls involving ACPI code should go between the setting of
@@ -1879,8 +1916,8 @@  int __hwdom_init xen_in_range(unsigned long mfn)
     if ( !xen_regions[0].s )
     {
         /* S3 resume code (and other real mode trampoline code) */
-        xen_regions[region_s3].s = bootsym_phys(boot_trampoline_start);
-        xen_regions[region_s3].e = bootsym_phys(boot_trampoline_end);
+        xen_regions[region_s3].s = trampsym_phys(perm_trampoline_start);
+        xen_regions[region_s3].e = trampsym_phys(perm_trampoline_end);
 
         /*
          * This needs to remain in sync with the uses of the same symbols in
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 65e9ceeece..0410331a9e 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -47,7 +47,7 @@ 
 #include <asm/tboot.h>
 #include <mach_apic.h>
 
-#define setup_trampoline()    (bootsym_phys(trampoline_realmode_entry))
+#define setup_trampoline()    (trampsym_phys(trampoline_realmode_entry))
 
 unsigned long __read_mostly trampoline_phys;
 
@@ -598,7 +598,7 @@  static int do_boot_cpu(int apicid, int cpu)
         {
             boot_error = 1;
             smp_mb();
-            if ( bootsym(trampoline_cpu_started) == 0xA5 )
+            if ( trampsym(trampoline_cpu_started) == 0xA5 )
                 /* trampoline started but...? */
                 printk("Stuck ??\n");
             else
@@ -614,7 +614,7 @@  static int do_boot_cpu(int apicid, int cpu)
     }
 
     /* mark "stuck" area as not stuck */
-    bootsym(trampoline_cpu_started) = 0;
+    trampsym(trampoline_cpu_started) = 0;
     smp_mb();
 
     return rc;
diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c
index 325d94d23a..daf19d346a 100644
--- a/xen/arch/x86/tboot.c
+++ b/xen/arch/x86/tboot.c
@@ -369,9 +369,9 @@  void tboot_shutdown(uint32_t shutdown_type)
          */
         g_tboot_shared->num_mac_regions = 3;
         /* S3 resume code (and other real mode trampoline code) */
-        g_tboot_shared->mac_regions[0].start = bootsym_phys(boot_trampoline_start);
-        g_tboot_shared->mac_regions[0].size = bootsym_phys(boot_trampoline_end) -
-                                              bootsym_phys(boot_trampoline_start);
+        g_tboot_shared->mac_regions[0].start = trampsym_phys(perm_trampoline_start);
+        g_tboot_shared->mac_regions[0].size = trampsym_phys(perm_trampoline_end) -
+                                              trampsym_phys(perm_trampoline_start);
         /* hypervisor .text + .rodata */
         g_tboot_shared->mac_regions[1].start = (uint64_t)__pa(&_stext);
         g_tboot_shared->mac_regions[1].size = __pa(&__2M_rodata_end) -
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 149cc4f7b5..05d476c423 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -697,7 +697,7 @@  void __init zap_low_mappings(void)
 
     /* Replace with mapping of the boot trampoline only. */
     map_pages_to_xen(trampoline_phys, maddr_to_mfn(trampoline_phys),
-                     PFN_UP(boot_trampoline_end - boot_trampoline_start),
+                     PFN_UP(perm_trampoline_end - perm_trampoline_start),
                      __PAGE_HYPERVISOR);
 }
 
diff --git a/xen/include/asm-x86/acpi.h b/xen/include/asm-x86/acpi.h
index 7032f3a001..8c5381aa47 100644
--- a/xen/include/asm-x86/acpi.h
+++ b/xen/include/asm-x86/acpi.h
@@ -106,7 +106,7 @@  extern int acpi_scan_nodes(u64 start, u64 end);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 
 extern struct acpi_sleep_info acpi_sinfo;
-#define acpi_video_flags bootsym(video_flags)
+#define acpi_video_flags trampsym(video_flags)
 struct xenpf_enter_acpi_sleep;
 extern int acpi_enter_sleep(struct xenpf_enter_acpi_sleep *sleep);
 extern int acpi_enter_state(u32 state);
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index ada8c7b4f6..78634296b5 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -89,12 +89,12 @@ 
 
 #ifndef __ASSEMBLY__
 extern unsigned long trampoline_phys;
-#define bootsym_phys(sym)                                 \
-    (((unsigned long)&(sym)-(unsigned long)&boot_trampoline_start)+trampoline_phys)
-#define bootsym(sym)                                      \
+#define trampsym_phys(sym)                                 \
+    (((unsigned long)&(sym)-(unsigned long)&perm_trampoline_start)+trampoline_phys)
+#define trampsym(sym)                                      \
     (*RELOC_HIDE((typeof(&(sym)))__va(__pa(&(sym))),      \
-                 trampoline_phys-__pa(boot_trampoline_start)))
-extern char boot_trampoline_start[], boot_trampoline_end[];
+                 trampoline_phys-__pa(perm_trampoline_start)))
+extern char perm_trampoline_start[], perm_trampoline_end[];
 extern char trampoline_realmode_entry[];
 extern unsigned int trampoline_xen_phys_start;
 extern unsigned char trampoline_cpu_started;