diff mbox series

[4/4] x86/desc: Build boot_{,compat_}gdt[] in C

Message ID 20190805124301.12887-5-andrew.cooper3@citrix.com (mailing list archive)
State New, archived
Headers show
Series x86/boot: Cleanup | expand

Commit Message

Andrew Cooper Aug. 5, 2019, 12:43 p.m. UTC
... where we can at least get the compiler to fill in the surrounding space
without having to do it manually.  This also reults in the symbols having
proper type/size information in the debug symbols.

Reorder 'raw' in the seg_desc_t union to allow for easier initialisation.

Leave a comment explaining the various restrictions we have on altering the
GDT layout.

No functional change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Wei Liu <wl@xen.org>
CC: Roger Pau Monné <roger.pau@citrix.com>

There is plenty more cleanup which can be done in the future.  As we are
64-bit, there is no need for load_TR() to keep the TSS in sync between the two
GDTs, which means it can drop all sgdt/lgdt instructions.  Also,
__HYPERVISOR_CS32 is unused and could be dropped.

As is demonstrated by the required includes, asm/desc.h is a tangle in need of
some cleanup.

The SYSEXIT GDT requirements are:
  R0 long code, R0 data, R3 compat code, R3 data, R3 long code, R3 data.

We could make Xen compatible by shifting the R0 code/data down by one slot,
moving R0 compat code elsewhere and replacing it with another R3 data.

However, this seems like a fairly pointless exercise, as the only thing it
will do is change the magic numbers which developers have become accustom to
seeing in backtraces.
---
 xen/arch/x86/Makefile      |  1 +
 xen/arch/x86/boot/head.S   |  1 -
 xen/arch/x86/boot/x86_64.S | 33 --------------------
 xen/arch/x86/desc.c        | 75 ++++++++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/desc.h |  2 +-
 5 files changed, 77 insertions(+), 35 deletions(-)
 create mode 100644 xen/arch/x86/desc.c

Comments

Roger Pau Monné Aug. 6, 2019, 3:04 p.m. UTC | #1
On Mon, Aug 05, 2019 at 01:43:01PM +0100, Andrew Cooper wrote:
> ... where we can at least get the compiler to fill in the surrounding space
> without having to do it manually.  This also reults in the symbols having
> proper type/size information in the debug symbols.
> 
> Reorder 'raw' in the seg_desc_t union to allow for easier initialisation.
> 
> Leave a comment explaining the various restrictions we have on altering the
> GDT layout.
> 
> No functional change.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks, Roger.
Jan Beulich Aug. 6, 2019, 3:48 p.m. UTC | #2
On 05.08.2019 14:43, Andrew Cooper wrote:
> --- a/xen/arch/x86/boot/x86_64.S
> +++ b/xen/arch/x86/boot/x86_64.S
> @@ -43,44 +43,11 @@ ENTRY(__high_start)
>   multiboot_ptr:
>           .long   0
>   
> -        .word   0
> -GLOBAL(boot_gdtr)
> -        .word   LAST_RESERVED_GDT_BYTE
> -        .quad   boot_gdt - FIRST_RESERVED_GDT_BYTE

Just as a remark: The intentional misalignment here is lost with
the transition to C.

> --- /dev/null
> +++ b/xen/arch/x86/desc.c
> @@ -0,0 +1,75 @@
> +#include <xen/types.h>
> +#include <xen/lib.h>
> +#include <xen/percpu.h>
> +#include <xen/mm.h>
> +
> +#include <asm/desc.h>
> +
> +/*
> + * Native and Compat GDTs used by Xen.
> + *
> + * The R1 and R3 descriptors are fixed in Xen's ABI for PV guests.  All other
> + * descriptors are in principle variable, with the following restrictions.
> + *
> + * All R0 descriptors must line up in both GDTs to allow for correct
> + * interrupt/exception handling.
> + *
> + * The SYSCALL/SYSRET GDT layout requires:
> + *  - R0 long mode code followed by R0 data.
> + *  - R3 compat code, followed by R3 data, followed by R3 long mode code.
> + *
> + * The SYSENTER GDT layout requirements are compatible with SYSCALL.  Xen does
> + * not use the SYSEXIT instruction, and does not provide a compatible GDT.
> + *
> + * These tables are used directly by CPU0, and used as the template for the
> + * GDTs of other CPUs.  Everything from the TSS onwards is unique per CPU.
> + */
> +__section(".data.page_aligned") __aligned(PAGE_SIZE)
> +seg_desc_t boot_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
> +{
> +    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64bit mode      */
> +    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0 data                  */
> +                                   /* 0xe018 - reserved                     */
> +    [ 4] = { 0x00cffa000000ffff }, /* 0xe023 - Ring 3 code, compatibility   */
> +    [ 5] = { 0x00cff2000000ffff }, /* 0xe02b - Ring 3 data                  */
> +    [ 6] = { 0x00affa000000ffff }, /* 0xe033 - Ring 3 code, 64-bit mode     */
> +    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code, compatibility   */
> +                                   /* 0xe040 - TSS                          */
> +                                   /* 0xe050 - LDT                          */
> +    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit == cpu) */
> +};
> +
> +__section(".data.page_aligned") __aligned(PAGE_SIZE)
> +seg_desc_t boot_compat_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
> +{
> +    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64-bit mode     */
> +    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0 data                  */
> +    [ 3] = { 0x00cfba000000ffff }, /* 0xe019 - Ring 1 code, compatibility   */
> +    [ 4] = { 0x00cfb2000000ffff }, /* 0xe021 - Ring 1 data                  */
> +    [ 5] = { 0x00cffa000000ffff }, /* 0xe02b - Ring 3 code, compatibility   */
> +    [ 6] = { 0x00cff2000000ffff }, /* 0xe033 - Ring 3 data                  */
> +    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code, compatibility   */
> +                                   /* 0xe040 - TSS                          */
> +                                   /* 0xe050 - LDT                          */
> +    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit == cpu) */
> +};

However unlikely it may be that we're going to change the order of
descriptors, I think there shouldn't be literal-number array indices
here. I think we want a local macro here to translate a selector (of
non-numeric form, e.g. __HYPERVISOR_CS64) to an array index. This way
adjustments to selector values that aren't part of the PV ABI (i.e.
anything from __HYPERVISOR_CS32 onwards) would be propagated here
right away. __HYPERVISOR_CS32 is a good example actually: We don't
seem to use it for anything, so we ought to be able to drop it. How
would one easily notice to also remove the initializer here without
the array index getting calculated from its (fundamental) selector?

While an orthogonal change - did you consider taking the opportunity
and set the accessed bits everywhere? Only the per-CPU entry has it
set right now.

Jan
Andrew Cooper Aug. 7, 2019, 10:46 a.m. UTC | #3
On 06/08/2019 16:48, Jan Beulich wrote:
> On 05.08.2019 14:43, Andrew Cooper wrote:
>> --- a/xen/arch/x86/boot/x86_64.S
>> +++ b/xen/arch/x86/boot/x86_64.S
>> @@ -43,44 +43,11 @@ ENTRY(__high_start)
>>   multiboot_ptr:
>>           .long   0
>>   -        .word   0
>> -GLOBAL(boot_gdtr)
>> -        .word   LAST_RESERVED_GDT_BYTE
>> -        .quad   boot_gdt - FIRST_RESERVED_GDT_BYTE
>
> Just as a remark: The intentional misalignment here is lost with
> the transition to C.

And it is used exactly once on each CPU.  I didn't even consider that
worth remarking on in the commit message.

>
>> --- /dev/null
>> +++ b/xen/arch/x86/desc.c
>> @@ -0,0 +1,75 @@
>> +#include <xen/types.h>
>> +#include <xen/lib.h>
>> +#include <xen/percpu.h>
>> +#include <xen/mm.h>
>> +
>> +#include <asm/desc.h>
>> +
>> +/*
>> + * Native and Compat GDTs used by Xen.
>> + *
>> + * The R1 and R3 descriptors are fixed in Xen's ABI for PV guests. 
>> All other
>> + * descriptors are in principle variable, with the following
>> restrictions.
>> + *
>> + * All R0 descriptors must line up in both GDTs to allow for correct
>> + * interrupt/exception handling.
>> + *
>> + * The SYSCALL/SYSRET GDT layout requires:
>> + *  - R0 long mode code followed by R0 data.
>> + *  - R3 compat code, followed by R3 data, followed by R3 long mode
>> code.
>> + *
>> + * The SYSENTER GDT layout requirements are compatible with
>> SYSCALL.  Xen does
>> + * not use the SYSEXIT instruction, and does not provide a
>> compatible GDT.
>> + *
>> + * These tables are used directly by CPU0, and used as the template
>> for the
>> + * GDTs of other CPUs.  Everything from the TSS onwards is unique
>> per CPU.
>> + */
>> +__section(".data.page_aligned") __aligned(PAGE_SIZE)
>> +seg_desc_t boot_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
>> +{
>> +    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64bit
>> mode      */
>> +    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0
>> data                  */
>> +                                   /* 0xe018 -
>> reserved                     */
>> +    [ 4] = { 0x00cffa000000ffff }, /* 0xe023 - Ring 3 code,
>> compatibility   */
>> +    [ 5] = { 0x00cff2000000ffff }, /* 0xe02b - Ring 3
>> data                  */
>> +    [ 6] = { 0x00affa000000ffff }, /* 0xe033 - Ring 3 code, 64-bit
>> mode     */
>> +    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code,
>> compatibility   */
>> +                                   /* 0xe040 -
>> TSS                          */
>> +                                   /* 0xe050 -
>> LDT                          */
>> +    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit
>> == cpu) */
>> +};
>> +
>> +__section(".data.page_aligned") __aligned(PAGE_SIZE)
>> +seg_desc_t boot_compat_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
>> +{
>> +    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64-bit
>> mode     */
>> +    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0
>> data                  */
>> +    [ 3] = { 0x00cfba000000ffff }, /* 0xe019 - Ring 1 code,
>> compatibility   */
>> +    [ 4] = { 0x00cfb2000000ffff }, /* 0xe021 - Ring 1
>> data                  */
>> +    [ 5] = { 0x00cffa000000ffff }, /* 0xe02b - Ring 3 code,
>> compatibility   */
>> +    [ 6] = { 0x00cff2000000ffff }, /* 0xe033 - Ring 3
>> data                  */
>> +    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code,
>> compatibility   */
>> +                                   /* 0xe040 -
>> TSS                          */
>> +                                   /* 0xe050 -
>> LDT                          */
>> +    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit
>> == cpu) */
>> +};
>
> However unlikely it may be that we're going to change the order of
> descriptors, I think there shouldn't be literal-number array indices
> here. I think we want a local macro here to translate a selector (of
> non-numeric form, e.g. __HYPERVISOR_CS64) to an array index.

I tried this, and then backtracked.  Our various constants are not in a
consistent-enough form to do this at this point.

More clean-up will come later, but as it stands, this is a
functionally-equivalent transform, and all I've got time for right now.

> This way
> adjustments to selector values that aren't part of the PV ABI (i.e.
> anything from __HYPERVISOR_CS32 onwards) would be propagated here
> right away. __HYPERVISOR_CS32 is a good example actually: We don't
> seem to use it for anything, so we ought to be able to drop it.

I did comment on this...

> How would one easily notice to also remove the initializer here without
> the array index getting calculated from its (fundamental) selector?
>
> While an orthogonal change - did you consider taking the opportunity
> and set the accessed bits everywhere? Only the per-CPU entry has it
> set right now.

I'd not even spotted that.  It should be broken out into an earlier
patch for backport.

~Andrew
Jan Beulich Aug. 7, 2019, 10:55 a.m. UTC | #4
On 07.08.2019 12:46, Andrew Cooper wrote:
> On 06/08/2019 16:48, Jan Beulich wrote:
>> On 05.08.2019 14:43, Andrew Cooper wrote:
>>> --- a/xen/arch/x86/boot/x86_64.S
>>> +++ b/xen/arch/x86/boot/x86_64.S
>>> @@ -43,44 +43,11 @@ ENTRY(__high_start)
>>>    multiboot_ptr:
>>>            .long   0
>>>    -        .word   0
>>> -GLOBAL(boot_gdtr)
>>> -        .word   LAST_RESERVED_GDT_BYTE
>>> -        .quad   boot_gdt - FIRST_RESERVED_GDT_BYTE
>>
>> Just as a remark: The intentional misalignment here is lost with
>> the transition to C.
> 
> And it is used exactly once on each CPU.  I didn't even consider that
> worth remarking on in the commit message.
> 
>>
>>> --- /dev/null
>>> +++ b/xen/arch/x86/desc.c
>>> @@ -0,0 +1,75 @@
>>> +#include <xen/types.h>
>>> +#include <xen/lib.h>
>>> +#include <xen/percpu.h>
>>> +#include <xen/mm.h>
>>> +
>>> +#include <asm/desc.h>
>>> +
>>> +/*
>>> + * Native and Compat GDTs used by Xen.
>>> + *
>>> + * The R1 and R3 descriptors are fixed in Xen's ABI for PV guests.
>>> All other
>>> + * descriptors are in principle variable, with the following
>>> restrictions.
>>> + *
>>> + * All R0 descriptors must line up in both GDTs to allow for correct
>>> + * interrupt/exception handling.
>>> + *
>>> + * The SYSCALL/SYSRET GDT layout requires:
>>> + *  - R0 long mode code followed by R0 data.
>>> + *  - R3 compat code, followed by R3 data, followed by R3 long mode
>>> code.
>>> + *
>>> + * The SYSENTER GDT layout requirements are compatible with
>>> SYSCALL.  Xen does
>>> + * not use the SYSEXIT instruction, and does not provide a
>>> compatible GDT.
>>> + *
>>> + * These tables are used directly by CPU0, and used as the template
>>> for the
>>> + * GDTs of other CPUs.  Everything from the TSS onwards is unique
>>> per CPU.
>>> + */
>>> +__section(".data.page_aligned") __aligned(PAGE_SIZE)
>>> +seg_desc_t boot_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
>>> +{
>>> +    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64bit
>>> mode      */
>>> +    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0
>>> data                  */
>>> +                                   /* 0xe018 -
>>> reserved                     */
>>> +    [ 4] = { 0x00cffa000000ffff }, /* 0xe023 - Ring 3 code,
>>> compatibility   */
>>> +    [ 5] = { 0x00cff2000000ffff }, /* 0xe02b - Ring 3
>>> data                  */
>>> +    [ 6] = { 0x00affa000000ffff }, /* 0xe033 - Ring 3 code, 64-bit
>>> mode     */
>>> +    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code,
>>> compatibility   */
>>> +                                   /* 0xe040 -
>>> TSS                          */
>>> +                                   /* 0xe050 -
>>> LDT                          */
>>> +    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit
>>> == cpu) */
>>> +};
>>> +
>>> +__section(".data.page_aligned") __aligned(PAGE_SIZE)
>>> +seg_desc_t boot_compat_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
>>> +{
>>> +    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64-bit
>>> mode     */
>>> +    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0
>>> data                  */
>>> +    [ 3] = { 0x00cfba000000ffff }, /* 0xe019 - Ring 1 code,
>>> compatibility   */
>>> +    [ 4] = { 0x00cfb2000000ffff }, /* 0xe021 - Ring 1
>>> data                  */
>>> +    [ 5] = { 0x00cffa000000ffff }, /* 0xe02b - Ring 3 code,
>>> compatibility   */
>>> +    [ 6] = { 0x00cff2000000ffff }, /* 0xe033 - Ring 3
>>> data                  */
>>> +    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code,
>>> compatibility   */
>>> +                                   /* 0xe040 -
>>> TSS                          */
>>> +                                   /* 0xe050 -
>>> LDT                          */
>>> +    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit
>>> == cpu) */
>>> +};
>>
>> However unlikely it may be that we're going to change the order of
>> descriptors, I think there shouldn't be literal-number array indices
>> here. I think we want a local macro here to translate a selector (of
>> non-numeric form, e.g. __HYPERVISOR_CS64) to an array index.
> 
> I tried this, and then backtracked.  Our various constants are not in a
> consistent-enough form to do this at this point.
> 
> More clean-up will come later, but as it stands, this is a
> functionally-equivalent transform,

Mostly, but specifically not for the gap between __HYPERVISOR_CS32
and PER_CPU_GDT_ENTRY.

> and all I've got time for right now.

Once the earlier 3 patches (assuming there's a dependency) have
gone in, would you mind me taking this and making another attempt?
That may convince me of your statement above, or result in fewer
hidden dependencies.

Jan
Andrew Cooper Aug. 7, 2019, 12:49 p.m. UTC | #5
On 07/08/2019 11:55, Jan Beulich wrote:
> On 07.08.2019 12:46, Andrew Cooper wrote:
>> On 06/08/2019 16:48, Jan Beulich wrote:
>>> On 05.08.2019 14:43, Andrew Cooper wrote:
>>>> --- a/xen/arch/x86/boot/x86_64.S
>>>> +++ b/xen/arch/x86/boot/x86_64.S
>>>> @@ -43,44 +43,11 @@ ENTRY(__high_start)
>>>>    multiboot_ptr:
>>>>            .long   0
>>>>    -        .word   0
>>>> -GLOBAL(boot_gdtr)
>>>> -        .word   LAST_RESERVED_GDT_BYTE
>>>> -        .quad   boot_gdt - FIRST_RESERVED_GDT_BYTE
>>>
>>> Just as a remark: The intentional misalignment here is lost with
>>> the transition to C.
>>
>> And it is used exactly once on each CPU.  I didn't even consider that
>> worth remarking on in the commit message.
>>
>>>
>>>> --- /dev/null
>>>> +++ b/xen/arch/x86/desc.c
>>>> @@ -0,0 +1,75 @@
>>>> +#include <xen/types.h>
>>>> +#include <xen/lib.h>
>>>> +#include <xen/percpu.h>
>>>> +#include <xen/mm.h>
>>>> +
>>>> +#include <asm/desc.h>
>>>> +
>>>> +/*
>>>> + * Native and Compat GDTs used by Xen.
>>>> + *
>>>> + * The R1 and R3 descriptors are fixed in Xen's ABI for PV guests.
>>>> All other
>>>> + * descriptors are in principle variable, with the following
>>>> restrictions.
>>>> + *
>>>> + * All R0 descriptors must line up in both GDTs to allow for correct
>>>> + * interrupt/exception handling.
>>>> + *
>>>> + * The SYSCALL/SYSRET GDT layout requires:
>>>> + *  - R0 long mode code followed by R0 data.
>>>> + *  - R3 compat code, followed by R3 data, followed by R3 long mode
>>>> code.
>>>> + *
>>>> + * The SYSENTER GDT layout requirements are compatible with
>>>> SYSCALL.  Xen does
>>>> + * not use the SYSEXIT instruction, and does not provide a
>>>> compatible GDT.
>>>> + *
>>>> + * These tables are used directly by CPU0, and used as the template
>>>> for the
>>>> + * GDTs of other CPUs.  Everything from the TSS onwards is unique
>>>> per CPU.
>>>> + */
>>>> +__section(".data.page_aligned") __aligned(PAGE_SIZE)
>>>> +seg_desc_t boot_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
>>>> +{
>>>> +    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64bit
>>>> mode      */
>>>> +    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0
>>>> data                  */
>>>> +                                   /* 0xe018 -
>>>> reserved                     */
>>>> +    [ 4] = { 0x00cffa000000ffff }, /* 0xe023 - Ring 3 code,
>>>> compatibility   */
>>>> +    [ 5] = { 0x00cff2000000ffff }, /* 0xe02b - Ring 3
>>>> data                  */
>>>> +    [ 6] = { 0x00affa000000ffff }, /* 0xe033 - Ring 3 code, 64-bit
>>>> mode     */
>>>> +    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code,
>>>> compatibility   */
>>>> +                                   /* 0xe040 -
>>>> TSS                          */
>>>> +                                   /* 0xe050 -
>>>> LDT                          */
>>>> +    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit
>>>> == cpu) */
>>>> +};
>>>> +
>>>> +__section(".data.page_aligned") __aligned(PAGE_SIZE)
>>>> +seg_desc_t boot_compat_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
>>>> +{
>>>> +    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64-bit
>>>> mode     */
>>>> +    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0
>>>> data                  */
>>>> +    [ 3] = { 0x00cfba000000ffff }, /* 0xe019 - Ring 1 code,
>>>> compatibility   */
>>>> +    [ 4] = { 0x00cfb2000000ffff }, /* 0xe021 - Ring 1
>>>> data                  */
>>>> +    [ 5] = { 0x00cffa000000ffff }, /* 0xe02b - Ring 3 code,
>>>> compatibility   */
>>>> +    [ 6] = { 0x00cff2000000ffff }, /* 0xe033 - Ring 3
>>>> data                  */
>>>> +    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code,
>>>> compatibility   */
>>>> +                                   /* 0xe040 -
>>>> TSS                          */
>>>> +                                   /* 0xe050 -
>>>> LDT                          */
>>>> +    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit
>>>> == cpu) */
>>>> +};
>>>
>>> However unlikely it may be that we're going to change the order of
>>> descriptors, I think there shouldn't be literal-number array indices
>>> here. I think we want a local macro here to translate a selector (of
>>> non-numeric form, e.g. __HYPERVISOR_CS64) to an array index.
>>
>> I tried this, and then backtracked.  Our various constants are not in a
>> consistent-enough form to do this at this point.
>>
>> More clean-up will come later, but as it stands, this is a
>> functionally-equivalent transform,
>
> Mostly, but specifically not for the gap between __HYPERVISOR_CS32
> and PER_CPU_GDT_ENTRY.
>
>> and all I've got time for right now.
>
> Once the earlier 3 patches (assuming there's a dependency) have
> gone in, would you mind me taking this and making another attempt?
> That may convince me of your statement above, or result in fewer
> hidden dependencies.

There are no functional dependencies between any patches in this series,
but a few minor textural ones.

I've just pushed the acked subset, which will address the minor textural
conflicts.  There will be a major conflict with the GDT Accessed patch,
but that will be easy to mechanically resolve.

~Andrew
diff mbox series

Patch

diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index 5e3840084b..2443fd2cc5 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -21,6 +21,7 @@  obj-$(CONFIG_PV) += compat.o x86_64/compat.o
 obj-$(CONFIG_KEXEC) += crash.o
 obj-y += debug.o
 obj-y += delay.o
+obj-y += desc.o
 obj-bin-y += dmi_scan.init.o
 obj-y += domctl.o
 obj-y += domain.o
diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index ab2d52a79d..782deac0d4 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -2,7 +2,6 @@ 
 #include <xen/multiboot2.h>
 #include <public/xen.h>
 #include <asm/asm_defns.h>
-#include <asm/desc.h>
 #include <asm/fixmap.h>
 #include <asm/page.h>
 #include <asm/processor.h>
diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S
index 3909363ca3..f762dfea11 100644
--- a/xen/arch/x86/boot/x86_64.S
+++ b/xen/arch/x86/boot/x86_64.S
@@ -43,44 +43,11 @@  ENTRY(__high_start)
 multiboot_ptr:
         .long   0
 
-        .word   0
-GLOBAL(boot_gdtr)
-        .word   LAST_RESERVED_GDT_BYTE
-        .quad   boot_gdt - FIRST_RESERVED_GDT_BYTE
-
 GLOBAL(stack_start)
         .quad   cpu0_stack
 
         .section .data.page_aligned, "aw", @progbits
         .align PAGE_SIZE, 0
-GLOBAL(boot_gdt)
-        .quad 0x0000000000000000     /* unused */
-        .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
-        .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
-        .quad 0x0000000000000000     /* reserved                          */
-        .quad 0x00cffa000000ffff     /* 0xe023 ring 3 code, compatibility */
-        .quad 0x00cff2000000ffff     /* 0xe02b ring 3 data                */
-        .quad 0x00affa000000ffff     /* 0xe033 ring 3 code, 64-bit mode   */
-        .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
-        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
-        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
-
-        .align PAGE_SIZE, 0
-/* NB. Even rings != 0 get access to the full 4Gb, as only the            */
-/*     (compatibility) machine->physical mapping table lives there.       */
-GLOBAL(boot_compat_gdt)
-        .quad 0x0000000000000000     /* unused */
-        .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
-        .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
-        .quad 0x00cfba000000ffff     /* 0xe019 ring 1 code, compatibility */
-        .quad 0x00cfb2000000ffff     /* 0xe021 ring 1 data                */
-        .quad 0x00cffa000000ffff     /* 0xe02b ring 3 code, compatibility */
-        .quad 0x00cff2000000ffff     /* 0xe033 ring 3 data                */
-        .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
-        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
-        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
-        .align PAGE_SIZE, 0
-
 /*
  * Mapping of first 2 megabytes of memory. This is mapped with 4kB mappings
  * to avoid type conflicts with fixed-range MTRRs covering the lowest megabyte
diff --git a/xen/arch/x86/desc.c b/xen/arch/x86/desc.c
new file mode 100644
index 0000000000..c5dad90cdd
--- /dev/null
+++ b/xen/arch/x86/desc.c
@@ -0,0 +1,75 @@ 
+#include <xen/types.h>
+#include <xen/lib.h>
+#include <xen/percpu.h>
+#include <xen/mm.h>
+
+#include <asm/desc.h>
+
+/*
+ * Native and Compat GDTs used by Xen.
+ *
+ * The R1 and R3 descriptors are fixed in Xen's ABI for PV guests.  All other
+ * descriptors are in principle variable, with the following restrictions.
+ *
+ * All R0 descriptors must line up in both GDTs to allow for correct
+ * interrupt/exception handling.
+ *
+ * The SYSCALL/SYSRET GDT layout requires:
+ *  - R0 long mode code followed by R0 data.
+ *  - R3 compat code, followed by R3 data, followed by R3 long mode code.
+ *
+ * The SYSENTER GDT layout requirements are compatible with SYSCALL.  Xen does
+ * not use the SYSEXIT instruction, and does not provide a compatible GDT.
+ *
+ * These tables are used directly by CPU0, and used as the template for the
+ * GDTs of other CPUs.  Everything from the TSS onwards is unique per CPU.
+ */
+__section(".data.page_aligned") __aligned(PAGE_SIZE)
+seg_desc_t boot_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
+{
+    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64bit mode      */
+    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0 data                  */
+                                   /* 0xe018 - reserved                     */
+    [ 4] = { 0x00cffa000000ffff }, /* 0xe023 - Ring 3 code, compatibility   */
+    [ 5] = { 0x00cff2000000ffff }, /* 0xe02b - Ring 3 data                  */
+    [ 6] = { 0x00affa000000ffff }, /* 0xe033 - Ring 3 code, 64-bit mode     */
+    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code, compatibility   */
+                                   /* 0xe040 - TSS                          */
+                                   /* 0xe050 - LDT                          */
+    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit == cpu) */
+};
+
+__section(".data.page_aligned") __aligned(PAGE_SIZE)
+seg_desc_t boot_compat_gdt[PAGE_SIZE / sizeof(seg_desc_t)] =
+{
+    [ 1] = { 0x00af9a000000ffff }, /* 0xe008 - Ring 0 code, 64-bit mode     */
+    [ 2] = { 0x00cf92000000ffff }, /* 0xe010 - Ring 0 data                  */
+    [ 3] = { 0x00cfba000000ffff }, /* 0xe019 - Ring 1 code, compatibility   */
+    [ 4] = { 0x00cfb2000000ffff }, /* 0xe021 - Ring 1 data                  */
+    [ 5] = { 0x00cffa000000ffff }, /* 0xe02b - Ring 3 code, compatibility   */
+    [ 6] = { 0x00cff2000000ffff }, /* 0xe033 - Ring 3 data                  */
+    [ 7] = { 0x00cf9a000000ffff }, /* 0xe038 - Ring 0 code, compatibility   */
+                                   /* 0xe040 - TSS                          */
+                                   /* 0xe050 - LDT                          */
+    [12] = { 0x0000910000000000 }, /* 0xe060 - per-CPU entry (limit == cpu) */
+};
+
+/*
+ * Used by each CPU as it starts up, to enter C with a suitable %cs.
+ * References boot_cpu_gdt_table for a short period, until the CPUs switch
+ * onto their per-CPU GDTs.
+ */
+struct desc_ptr boot_gdtr = {
+    .limit = LAST_RESERVED_GDT_BYTE,
+    .base = (unsigned long)(boot_gdt - FIRST_RESERVED_GDT_ENTRY),
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h
index 0be9348d29..0ebdcd05a9 100644
--- a/xen/include/asm-x86/desc.h
+++ b/xen/include/asm-x86/desc.h
@@ -103,10 +103,10 @@ 
 #define SYS_DESC_trap_gate    15
 
 typedef union {
+    uint64_t raw;
     struct {
         uint32_t a, b;
     };
-    uint64_t raw;
 } seg_desc_t;
 
 typedef union {