diff mbox

x86: Use entire page for the per-cpu GDT only if paravirt-enabled

Message ID 1443290440-14930-1-git-send-email-dvlasenk@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Denys Vlasenko Sept. 26, 2015, 6 p.m. UTC
We have our GDT in a page-sized per-cpu structure, gdt_page.

On x86_64 kernel, GDT is 128 bytes - only ~3% of that page is used.

It is page-sized because of paravirt. Hypervisors need to know when
GDT is changed, so they remap it read-only and handle write faults.
If it's not in its own page, other writes nearby will cause
those faults too.

In other words, we need GDT to live in a separate page
only if CONFIG_HYPERVISOR_GUEST=y.

This patch reduces GDT alignment to cacheline-aligned
if CONFIG_HYPERVISOR_GUEST is not set.

Patch also renames gdt_page to cpu_gdt (mimicking naming of existing
cpu_tss per-cpu variable), since now it is not always a full page.

    $ objdump -x vmlinux | grep .data..percpu | sort
Before:
    (offset)                                (size)
    0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
    0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
    0000000000009000  w    O .data..percpu  0000000000001000 gdt_page  <<< HERE
    000000000000a000  w    O .data..percpu  0000000000000008 espfix_waddr
    000000000000a008  w    O .data..percpu  0000000000000008 espfix_stack
    ...
    0000000000019398 g       .data..percpu  0000000000000000 __per_cpu_end
After:
    0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
    0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
    0000000000009000  w    O .data..percpu  0000000000000008 espfix_waddr
    0000000000009008  w    O .data..percpu  0000000000000008 espfix_stack
    ...
    0000000000013c80  w    O .data..percpu  0000000000000040 cyc2ns
    0000000000013cc0  w    O .data..percpu  00000000000022c0 cpu_tss
    0000000000015f80  w    O .data..percpu  0000000000000080 cpu_gdt  <<< HERE
    0000000000016000  w    O .data..percpu  0000000000000018 cpu_tlbstate
    ...
    0000000000018418 g       .data..percpu  0000000000000000 __per_cpu_end

Run-tested on a 144 CPU machine.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Ingo Molnar <mingo@kernel.org>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
CC: Boris Ostrovsky <boris.ostrovsky@oracle.com>
CC: David Vrabel <david.vrabel@citrix.com>
CC: Joerg Roedel <joro@8bytes.org>
CC: Gleb Natapov <gleb@kernel.org>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: kvm@vger.kernel.org
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/entry/entry_32.S        |  2 +-
 arch/x86/include/asm/desc.h      | 16 +++++++++++-----
 arch/x86/kernel/cpu/common.c     | 10 ++++++++--
 arch/x86/kernel/cpu/perf_event.c |  2 +-
 arch/x86/kernel/head_32.S        |  4 ++--
 arch/x86/kernel/head_64.S        |  2 +-
 arch/x86/kernel/vmlinux.lds.S    |  2 +-
 arch/x86/tools/relocs.c          |  2 +-
 arch/x86/xen/enlighten.c         |  4 ++--
 9 files changed, 28 insertions(+), 16 deletions(-)

Comments

H. Peter Anvin Sept. 26, 2015, 7:50 p.m. UTC | #1
NAK.  We really should map the GDT read-only on all 64 bit systems, since we can't hide the address from SLDT.  Same with the IDT.

On September 26, 2015 11:00:40 AM PDT, Denys Vlasenko <dvlasenk@redhat.com> wrote:
>We have our GDT in a page-sized per-cpu structure, gdt_page.
>
>On x86_64 kernel, GDT is 128 bytes - only ~3% of that page is used.
>
>It is page-sized because of paravirt. Hypervisors need to know when
>GDT is changed, so they remap it read-only and handle write faults.
>If it's not in its own page, other writes nearby will cause
>those faults too.
>
>In other words, we need GDT to live in a separate page
>only if CONFIG_HYPERVISOR_GUEST=y.
>
>This patch reduces GDT alignment to cacheline-aligned
>if CONFIG_HYPERVISOR_GUEST is not set.
>
>Patch also renames gdt_page to cpu_gdt (mimicking naming of existing
>cpu_tss per-cpu variable), since now it is not always a full page.
>
>    $ objdump -x vmlinux | grep .data..percpu | sort
>Before:
>    (offset)                                (size)
>    0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
>    0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
>    0000000000009000  w    O .data..percpu  0000000000001000 gdt_page  <<< HERE
>    000000000000a000  w    O .data..percpu  0000000000000008 espfix_waddr
>    000000000000a008  w    O .data..percpu  0000000000000008 espfix_stack
>    ...
>    0000000000019398 g       .data..percpu  0000000000000000 __per_cpu_end
>After:
>    0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
>    0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
>    0000000000009000  w    O .data..percpu  0000000000000008 espfix_waddr
>    0000000000009008  w    O .data..percpu  0000000000000008 espfix_stack
>    ...
>    0000000000013c80  w    O .data..percpu  0000000000000040 cyc2ns
>    0000000000013cc0  w    O .data..percpu  00000000000022c0 cpu_tss
>    0000000000015f80  w    O .data..percpu  0000000000000080 cpu_gdt  <<< HERE
>    0000000000016000  w    O .data..percpu  0000000000000018 cpu_tlbstate
>    ...
>    0000000000018418 g       .data..percpu  0000000000000000 __per_cpu_end
>
>Run-tested on a 144 CPU machine.
>
>Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
>CC: Ingo Molnar <mingo@kernel.org>
>CC: H. Peter Anvin <hpa@zytor.com>
>CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
>CC: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>CC: David Vrabel <david.vrabel@citrix.com>
>CC: Joerg Roedel <joro@8bytes.org>
>CC: Gleb Natapov <gleb@kernel.org>
>CC: Paolo Bonzini <pbonzini@redhat.com>
>CC: kvm@vger.kernel.org
>CC: x86@kernel.org
>CC: linux-kernel@vger.kernel.org
>---
> arch/x86/entry/entry_32.S        |  2 +-
> arch/x86/include/asm/desc.h      | 16 +++++++++++-----
> arch/x86/kernel/cpu/common.c     | 10 ++++++++--
> arch/x86/kernel/cpu/perf_event.c |  2 +-
> arch/x86/kernel/head_32.S        |  4 ++--
> arch/x86/kernel/head_64.S        |  2 +-
> arch/x86/kernel/vmlinux.lds.S    |  2 +-
> arch/x86/tools/relocs.c          |  2 +-
> arch/x86/xen/enlighten.c         |  4 ++--
> 9 files changed, 28 insertions(+), 16 deletions(-)
>
>diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
>index b2909bf..bc6ae1c 100644
>--- a/arch/x86/entry/entry_32.S
>+++ b/arch/x86/entry/entry_32.S
>@@ -429,7 +429,7 @@ ldt_ss:
>  * compensating for the offset by changing to the ESPFIX segment with
>  * a base address that matches for the difference.
>  */
>-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
>+#define GDT_ESPFIX_SS PER_CPU_VAR(cpu_gdt) + (GDT_ENTRY_ESPFIX_SS * 8)
> 	mov	%esp, %edx			/* load kernel esp */
> 	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
> 	mov	%dx, %ax			/* eax: new kernel esp */
>diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
>index 4e10d73..76de300 100644
>--- a/arch/x86/include/asm/desc.h
>+++ b/arch/x86/include/asm/desc.h
>@@ -39,15 +39,21 @@ extern gate_desc idt_table[];
> extern struct desc_ptr debug_idt_descr;
> extern gate_desc debug_idt_table[];
> 
>-struct gdt_page {
>+struct cpu_gdt {
> 	struct desc_struct gdt[GDT_ENTRIES];
>-} __attribute__((aligned(PAGE_SIZE)));
>-
>-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
>+}
>+#ifdef CONFIG_HYPERVISOR_GUEST
>+/* Xen et al want GDT to have its own page. They remap it read-only */
>+__attribute__((aligned(PAGE_SIZE)));
>+DECLARE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt);
>+#else
>+____cacheline_aligned;
>+DECLARE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt);
>+#endif
> 
> static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
> {
>-	return per_cpu(gdt_page, cpu).gdt;
>+	return per_cpu(cpu_gdt, cpu).gdt;
> }
> 
> #ifdef CONFIG_X86_64
>diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
>index de22ea7..6b90785 100644
>--- a/arch/x86/kernel/cpu/common.c
>+++ b/arch/x86/kernel/cpu/common.c
>@@ -92,7 +92,13 @@ static const struct cpu_dev default_cpu = {
> 
> static const struct cpu_dev *this_cpu = &default_cpu;
> 
>-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
>+#ifdef CONFIG_HYPERVISOR_GUEST
>+/* Xen et al want GDT to have its own page. They remap it read-only */
>+DEFINE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt) =
>+#else
>+DEFINE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt) =
>+#endif
>+{ .gdt = {
> #ifdef CONFIG_X86_64
> 	/*
> 	 * We need valid kernel segments for data and code in long mode too
>@@ -144,7 +150,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
> 	GDT_STACK_CANARY_INIT
> #endif
> } };
>-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
>+EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt);
> 
> static int __init x86_mpx_setup(char *s)
> {
>diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>index 66dd3fe9..41ebc94 100644
>--- a/arch/x86/kernel/cpu/perf_event.c
>+++ b/arch/x86/kernel/cpu/perf_event.c
>@@ -2198,7 +2198,7 @@ static unsigned long get_segment_base(unsigned int segment)
> 		if (idx > GDT_ENTRIES)
> 			return 0;
> 
>-		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
>+		desc = raw_cpu_ptr(cpu_gdt.gdt) + idx;
> 	}
> 
> 	return get_desc_base(desc);
>diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
>index 0e2d96f..33c9d10 100644
>--- a/arch/x86/kernel/head_32.S
>+++ b/arch/x86/kernel/head_32.S
>@@ -521,7 +521,7 @@ setup_once:
> 	 * relocation.  Manually set base address in stack canary
> 	 * segment descriptor.
> 	 */
>-	movl $gdt_page,%eax
>+	movl $cpu_gdt,%eax
> 	movl $stack_canary,%ecx
> 	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
> 	shrl $16, %ecx
>@@ -758,7 +758,7 @@ idt_descr:
> 	.word 0				# 32 bit align gdt_desc.address
> ENTRY(early_gdt_descr)
> 	.word GDT_ENTRIES*8-1
>-	.long gdt_page			/* Overwritten for secondary CPUs */
>+	.long cpu_gdt			/* Overwritten for secondary CPUs */
> 
> /*
>  * The boot_gdt must mirror the equivalent in setup.S and is
>diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
>index 1d40ca8..5a80e8c 100644
>--- a/arch/x86/kernel/head_64.S
>+++ b/arch/x86/kernel/head_64.S
>@@ -510,7 +510,7 @@ NEXT_PAGE(level1_fixmap_pgt)
> early_gdt_descr:
> 	.word	GDT_ENTRIES*8-1
> early_gdt_descr_base:
>-	.quad	INIT_PER_CPU_VAR(gdt_page)
>+	.quad	INIT_PER_CPU_VAR(cpu_gdt)
> 
> ENTRY(phys_base)
> 	/* This must match the first entry in level2_kernel_pgt */
>diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
>index 74e4bf1..198e46b 100644
>--- a/arch/x86/kernel/vmlinux.lds.S
>+++ b/arch/x86/kernel/vmlinux.lds.S
>@@ -348,7 +348,7 @@ SECTIONS
>  * for the boot processor.
>  */
> #define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
>-INIT_PER_CPU(gdt_page);
>+INIT_PER_CPU(cpu_gdt);
> INIT_PER_CPU(irq_stack_union);
> 
> /*
>diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
>index 0c2fae8..5a1da82 100644
>--- a/arch/x86/tools/relocs.c
>+++ b/arch/x86/tools/relocs.c
>@@ -736,7 +736,7 @@ static void percpu_init(void)
>  *
>  * The "gold" linker incorrectly associates:
>  *	init_per_cpu__irq_stack_union
>- *	init_per_cpu__gdt_page
>+ *	init_per_cpu__cpu_gdt
>  */
> static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
> {
>diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>index 30d12af..baecfc6 100644
>--- a/arch/x86/xen/enlighten.c
>+++ b/arch/x86/xen/enlighten.c
>@@ -1592,10 +1592,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
> 
> 	/*
> 	 * The only reliable way to retain the initial address of the
>-	 * percpu gdt_page is to remember it here, so we can go and
>+	 * percpu cpu_gdt is to remember it here, so we can go and
> 	 * mark it RW later, when the initial percpu area is freed.
> 	 */
>-	xen_initial_gdt = &per_cpu(gdt_page, 0);
>+	xen_initial_gdt = &per_cpu(cpu_gdt, 0);
> 
> 	xen_smp_init();
>
Denys Vlasenko Sept. 26, 2015, 8:38 p.m. UTC | #2
On 09/26/2015 09:50 PM, H. Peter Anvin wrote:
> NAK.  We really should map the GDT read-only on all 64 bit systems,
> since we can't hide the address from SLDT.  Same with the IDT.

Sorry, I don't understand your point.


> On September 26, 2015 11:00:40 AM PDT, Denys Vlasenko <dvlasenk@redhat.com> wrote:
>> We have our GDT in a page-sized per-cpu structure, gdt_page.
>>
>> On x86_64 kernel, GDT is 128 bytes - only ~3% of that page is used.
>>
>> It is page-sized because of paravirt. Hypervisors need to know when
>> GDT is changed, so they remap it read-only and handle write faults.
>> If it's not in its own page, other writes nearby will cause
>> those faults too.
>>
>> In other words, we need GDT to live in a separate page
>> only if CONFIG_HYPERVISOR_GUEST=y.
>>
>> This patch reduces GDT alignment to cacheline-aligned
>> if CONFIG_HYPERVISOR_GUEST is not set.
>>
>> Patch also renames gdt_page to cpu_gdt (mimicking naming of existing
>> cpu_tss per-cpu variable), since now it is not always a full page.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar Sept. 28, 2015, 7:58 a.m. UTC | #3
* Denys Vlasenko <dvlasenk@redhat.com> wrote:

> On 09/26/2015 09:50 PM, H. Peter Anvin wrote:
> > NAK.  We really should map the GDT read-only on all 64 bit systems,
> > since we can't hide the address from SLDT.  Same with the IDT.
> 
> Sorry, I don't understand your point.

So the problem is that right now the SGDT instruction (which is unprivileged) 
leaks the real address of the kernel image:

 fomalhaut:~> ./sgdt 
 SGDT: ffff88303fd89000 / 007f

that 'ffff88303fd89000' is a kernel address.

 fomalhaut:~> cat sgdt.c 
 #include <stdio.h>
 #include <stdlib.h>

 int main(void)
 {
         struct gdt_desc {
                 unsigned short  limit;
                 unsigned long   addr;
         } __attribute__((packed)) gdt_desc = { -1, -1 };

         asm volatile("sgdt %0": "=m" (gdt_desc));

         printf("SGDT: %016lx / %04x\n", gdt_desc.addr, gdt_desc.limit);

         return 0;
 }

Your observation in the changelog and your patch:

> >> It is page-sized because of paravirt. [...]

... conflicts with the intention to mark (remap) the primary GDT address read-only 
on native kernels as well.

So what we should do instead is to use the page alignment properly and remap the 
GDT to a read-only location, and load that one.

This would have a couple of advantages:

 - This would give kernel address randomization more teeth on x86.

 - An additional advantage would be that rootkits overwriting the GDT would have 
   a bit more work to do.

 - A third advantage would be that for NUMA systems we could 'mirror' the GDT into
   node-local memory and load those. This makes GDT load cache-misses a bit less
   expensive.

The IDT is already remapped:

 fomalhaut:~> ./sidt 
 Sidt: ffffffffff57b000 / 0fff
 fomalhaut:~> cat sidt.c
 #include <stdio.h>
 #include <stdlib.h>

 int main(void)
 {
         struct idt_desc {
                 unsigned short  limit;
                 unsigned long   addr;
         } __attribute__((packed)) idt_desc = { -1, -1 };

         asm volatile("sidt %0": "=m" (idt_desc));

         printf("Sidt: %016lx / %04x\n", idt_desc.addr, idt_desc.limit);

         return 0;
 }

Thanks,

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Denys Vlasenko Sept. 28, 2015, 12:45 p.m. UTC | #4
On 09/28/2015 09:58 AM, Ingo Molnar wrote:
> 
> * Denys Vlasenko <dvlasenk@redhat.com> wrote:
> 
>> On 09/26/2015 09:50 PM, H. Peter Anvin wrote:
>>> NAK.  We really should map the GDT read-only on all 64 bit systems,
>>> since we can't hide the address from SLDT.  Same with the IDT.
>>
>> Sorry, I don't understand your point.
> 
> So the problem is that right now the SGDT instruction (which is unprivileged) 
> leaks the real address of the kernel image:
> 
>  fomalhaut:~> ./sgdt 
>  SGDT: ffff88303fd89000 / 007f
> 
> that 'ffff88303fd89000' is a kernel address.

Thank you.
I do know that SGDT and friends are unprivileged on x86
and thus they allow userspace (and guest kernels in paravirt)
learn things they don't need to know.

I don't see how making GDT page-aligned and page-sized
changes anything in this regard. SGDT will still work,
and still leak GDT address.

> Your observation in the changelog and your patch:
> 
>>>> It is page-sized because of paravirt. [...]
> 
> ... conflicts with the intention to mark (remap) the primary GDT address read-only 
> on native kernels as well.
> 
> So what we should do instead is to use the page alignment properly and remap the 
> GDT to a read-only location, and load that one.

If we'd have a small GDT (i.e. what my patch does), we still can remap the entire page
which contains small GDT, and simply don't care that some other data is also visible
through that RO page.

> This would have a couple of advantages:
> 
>  - This would give kernel address randomization more teeth on x86.
> 
>  - An additional advantage would be that rootkits overwriting the GDT would have 
>    a bit more work to do.
> 
>  - A third advantage would be that for NUMA systems we could 'mirror' the GDT into
>    node-local memory and load those. This makes GDT load cache-misses a bit less
>    expensive.

GDT is per-cpu. Isn't per-cpu memory already NUMA-local?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar Sept. 29, 2015, 9:01 a.m. UTC | #5
* Denys Vlasenko <dvlasenk@redhat.com> wrote:

> On 09/28/2015 09:58 AM, Ingo Molnar wrote:
> > 
> > * Denys Vlasenko <dvlasenk@redhat.com> wrote:
> > 
> >> On 09/26/2015 09:50 PM, H. Peter Anvin wrote:
> >>> NAK.  We really should map the GDT read-only on all 64 bit systems,
> >>> since we can't hide the address from SLDT.  Same with the IDT.
> >>
> >> Sorry, I don't understand your point.
> > 
> > So the problem is that right now the SGDT instruction (which is unprivileged) 
> > leaks the real address of the kernel image:
> > 
> >  fomalhaut:~> ./sgdt 
> >  SGDT: ffff88303fd89000 / 007f
> > 
> > that 'ffff88303fd89000' is a kernel address.
> 
> Thank you.
> I do know that SGDT and friends are unprivileged on x86
> and thus they allow userspace (and guest kernels in paravirt)
> learn things they don't need to know.
> 
> I don't see how making GDT page-aligned and page-sized
> changes anything in this regard. SGDT will still work,
> and still leak GDT address.

Well, as I try to explain it in the other part of my mail, doing so enables us to 
remap the GDT to a less security sensitive virtual address that does not leak the 
kernel's randomized address:

> > Your observation in the changelog and your patch:
> > 
> >>>> It is page-sized because of paravirt. [...]
> > 
> > ... conflicts with the intention to mark (remap) the primary GDT address read-only 
> > on native kernels as well.
> > 
> > So what we should do instead is to use the page alignment properly and remap the 
> > GDT to a read-only location, and load that one.
> 
> If we'd have a small GDT (i.e. what my patch does), we still can remap the 
> entire page which contains small GDT, and simply don't care that some other data 
> is also visible through that RO page.

That's generally considered fragile: suppose an attacker has a limited information 
leak that can read absolute addresses with system privilege but he doesn't know 
the kernel's randomized base offset. With a 'partial page' mapping there could be 
function pointers near the GDT, part of the page the GDT happens to be on, that 
leak this information.

(Same goes for crypto keys or other critical information (like canary information, 
salts, etc.) accidentally ending up nearby.)

Arguably it's a bit tenuous, but when playing remapping games it's generally 
considered good to be page aligned and page sized, with zero padding.

> > This would have a couple of advantages:
> > 
> >  - This would give kernel address randomization more teeth on x86.
> > 
> >  - An additional advantage would be that rootkits overwriting the GDT would have 
> >    a bit more work to do.
> > 
> >  - A third advantage would be that for NUMA systems we could 'mirror' the GDT into
> >    node-local memory and load those. This makes GDT load cache-misses a bit less
> >    expensive.
> 
> GDT is per-cpu. Isn't per-cpu memory already NUMA-local?

Indeed it is:

fomalhaut:~> for ((cpu=1; cpu<9; cpu++)); do taskset $cpu ./sgdt ; done
SGDT: ffff88103fa09000 / 007f
SGDT: ffff88103fa29000 / 007f
SGDT: ffff88103fa29000 / 007f
SGDT: ffff88103fa49000 / 007f
SGDT: ffff88103fa49000 / 007f
SGDT: ffff88103fa49000 / 007f
SGDT: ffff88103fa29000 / 007f
SGDT: ffff88103fa69000 / 007f

I confused it with the IDT, which is still global.

This also means that the GDT in itself does not leak kernel addresses at the 
moment, except it leaks the layout of the percpu area.

So my suggestion would be to:

 - make the GDT unconditionally page aligned and sized, then remap it to a
   read-only address unconditionally as well, like we do it for the IDT.

 - make the IDT per CPU as well, for performance reasons.

Thanks,

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski Sept. 29, 2015, 5:35 p.m. UTC | #6
On Sep 29, 2015 2:01 AM, "Ingo Molnar" <mingo@kernel.org> wrote:
>
>
> * Denys Vlasenko <dvlasenk@redhat.com> wrote:
>
> > On 09/28/2015 09:58 AM, Ingo Molnar wrote:
> > >
> > > * Denys Vlasenko <dvlasenk@redhat.com> wrote:
> > >
> > >> On 09/26/2015 09:50 PM, H. Peter Anvin wrote:
> > >>> NAK.  We really should map the GDT read-only on all 64 bit systems,
> > >>> since we can't hide the address from SLDT.  Same with the IDT.
> > >>
> > >> Sorry, I don't understand your point.
> > >
> > > So the problem is that right now the SGDT instruction (which is unprivileged)
> > > leaks the real address of the kernel image:
> > >
> > >  fomalhaut:~> ./sgdt
> > >  SGDT: ffff88303fd89000 / 007f
> > >
> > > that 'ffff88303fd89000' is a kernel address.
> >
> > Thank you.
> > I do know that SGDT and friends are unprivileged on x86
> > and thus they allow userspace (and guest kernels in paravirt)
> > learn things they don't need to know.
> >
> > I don't see how making GDT page-aligned and page-sized
> > changes anything in this regard. SGDT will still work,
> > and still leak GDT address.
>
> Well, as I try to explain it in the other part of my mail, doing so enables us to
> remap the GDT to a less security sensitive virtual address that does not leak the
> kernel's randomized address:
>
> > > Your observation in the changelog and your patch:
> > >
> > >>>> It is page-sized because of paravirt. [...]
> > >
> > > ... conflicts with the intention to mark (remap) the primary GDT address read-only
> > > on native kernels as well.
> > >
> > > So what we should do instead is to use the page alignment properly and remap the
> > > GDT to a read-only location, and load that one.
> >
> > If we'd have a small GDT (i.e. what my patch does), we still can remap the
> > entire page which contains small GDT, and simply don't care that some other data
> > is also visible through that RO page.
>
> That's generally considered fragile: suppose an attacker has a limited information
> leak that can read absolute addresses with system privilege but he doesn't know
> the kernel's randomized base offset. With a 'partial page' mapping there could be
> function pointers near the GDT, part of the page the GDT happens to be on, that
> leak this information.
>
> (Same goes for crypto keys or other critical information (like canary information,
> salts, etc.) accidentally ending up nearby.)
>
> Arguably it's a bit tenuous, but when playing remapping games it's generally
> considered good to be page aligned and page sized, with zero padding.
>
> > > This would have a couple of advantages:
> > >
> > >  - This would give kernel address randomization more teeth on x86.
> > >
> > >  - An additional advantage would be that rootkits overwriting the GDT would have
> > >    a bit more work to do.
> > >
> > >  - A third advantage would be that for NUMA systems we could 'mirror' the GDT into
> > >    node-local memory and load those. This makes GDT load cache-misses a bit less
> > >    expensive.
> >
> > GDT is per-cpu. Isn't per-cpu memory already NUMA-local?
>
> Indeed it is:
>
> fomalhaut:~> for ((cpu=1; cpu<9; cpu++)); do taskset $cpu ./sgdt ; done
> SGDT: ffff88103fa09000 / 007f
> SGDT: ffff88103fa29000 / 007f
> SGDT: ffff88103fa29000 / 007f
> SGDT: ffff88103fa49000 / 007f
> SGDT: ffff88103fa49000 / 007f
> SGDT: ffff88103fa49000 / 007f
> SGDT: ffff88103fa29000 / 007f
> SGDT: ffff88103fa69000 / 007f
>
> I confused it with the IDT, which is still global.
>
> This also means that the GDT in itself does not leak kernel addresses at the
> moment, except it leaks the layout of the percpu area.
>
> So my suggestion would be to:
>
>  - make the GDT unconditionally page aligned and sized, then remap it to a
>    read-only address unconditionally as well, like we do it for the IDT.

Does anyone know what happens if you stick a non-accessed segment in
the GDT, map the GDT RO, and access it?  The docs are extremely vague
on the interplay between segmentation and paging on the segmentation
structures themselves.  My guess is that it causes #PF.  This might
break set_thread_area users unless we change set_thread_area to force
the accessed bit on.

There's a possible worse failure mode: if someone pokes an un-accessed
segment into SS or CS using sigreturn, then it's within the realm of
possibility that IRET would generate #PF (hey Intel and AMD, please
document this!).  I don't think that would be rootable, but at the
very least we'd want to make sure it doesn't OOPS by either making it
impossible or adding an explicit test to sigreturn.c.

hpa pointed out in another thread that the GDT *must* be writable on
32-bit kernels because we use a task gate for NMI and jumping through
a task gate writes to the GDT.

On another note, SGDT is considerably faster than LSL, at least on
Sandy Bridge.  The vdso might be able to take advantage of that for
getcpu.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Torvalds Sept. 29, 2015, 5:50 p.m. UTC | #7
On Tue, Sep 29, 2015 at 1:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>
> Does anyone know what happens if you stick a non-accessed segment in
> the GDT, map the GDT RO, and access it?

You should get a #PF, as you guess, but go ahead and test it if you
want to make sure.

We do something very similar for the old Pentium F0 0F bug - we mark
the IDT read-only, which causes the (bogus) locked read of the IDT
entry that the F00F bug resulted in to be caught as a page fault
instead.

              Linus
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski Sept. 29, 2015, 6:02 p.m. UTC | #8
On Tue, Sep 29, 2015 at 10:50 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Tue, Sep 29, 2015 at 1:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>
>> Does anyone know what happens if you stick a non-accessed segment in
>> the GDT, map the GDT RO, and access it?
>
> You should get a #PF, as you guess, but go ahead and test it if you
> want to make sure.
>

Then I think that, if we do this, the patch series should first make
it percpu and fixmapped but RW and then flip it RO as a separate patch
in case we need to revert the actual RO bit.  I don't want to break
Wine or The Witcher 2 because of this, and we might need various
fixups.  I really hope that no one uses get_thread_area to check
whether TLS has been accessed.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
H. Peter Anvin Sept. 29, 2015, 6:18 p.m. UTC | #9
SGDT would be easy to use, and it is logical that it is faster since it reads an internal register.  SIDT does too but unlike the GDT has a secondary limit (it can never be larger than 4096 bytes) and so all limits in the range 4095-65535 are exactly equivalent.

Anything that causes a write to the GDT will #PF if read-only.  So yes, we need to force the accessed bit to set.  This shouldn't be a problem and in fact ought to be a performance improvement.

On September 29, 2015 10:35:38 AM PDT, Andy Lutomirski <luto@amacapital.net> wrote:
>On Sep 29, 2015 2:01 AM, "Ingo Molnar" <mingo@kernel.org> wrote:
>>
>>
>> * Denys Vlasenko <dvlasenk@redhat.com> wrote:
>>
>> > On 09/28/2015 09:58 AM, Ingo Molnar wrote:
>> > >
>> > > * Denys Vlasenko <dvlasenk@redhat.com> wrote:
>> > >
>> > >> On 09/26/2015 09:50 PM, H. Peter Anvin wrote:
>> > >>> NAK.  We really should map the GDT read-only on all 64 bit systems,
>> > >>> since we can't hide the address from SLDT.  Same with the IDT.
>> > >>
>> > >> Sorry, I don't understand your point.
>> > >
>> > > So the problem is that right now the SGDT instruction (which is unprivileged)
>> > > leaks the real address of the kernel image:
>> > >
>> > >  fomalhaut:~> ./sgdt
>> > >  SGDT: ffff88303fd89000 / 007f
>> > >
>> > > that 'ffff88303fd89000' is a kernel address.
>> >
>> > Thank you.
>> > I do know that SGDT and friends are unprivileged on x86
>> > and thus they allow userspace (and guest kernels in paravirt)
>> > learn things they don't need to know.
>> >
>> > I don't see how making GDT page-aligned and page-sized
>> > changes anything in this regard. SGDT will still work,
>> > and still leak GDT address.
>>
>> Well, as I try to explain it in the other part of my mail, doing so enables us to
>> remap the GDT to a less security sensitive virtual address that does not leak the
>> kernel's randomized address:
>>
>> > > Your observation in the changelog and your patch:
>> > >
>> > >>>> It is page-sized because of paravirt. [...]
>> > >
>> > > ... conflicts with the intention to mark (remap) the primary GDT address read-only
>> > > on native kernels as well.
>> > >
>> > > So what we should do instead is to use the page alignment properly and remap the
>> > > GDT to a read-only location, and load that one.
>> >
>> > If we'd have a small GDT (i.e. what my patch does), we still can remap the
>> > entire page which contains small GDT, and simply don't care that some other data
>> > is also visible through that RO page.
>>
>> That's generally considered fragile: suppose an attacker has a limited information
>> leak that can read absolute addresses with system privilege but he doesn't know
>> the kernel's randomized base offset. With a 'partial page' mapping there could be
>> function pointers near the GDT, part of the page the GDT happens to be on, that
>> leak this information.
>>
>> (Same goes for crypto keys or other critical information (like canary information,
>> salts, etc.) accidentally ending up nearby.)
>>
>> Arguably it's a bit tenuous, but when playing remapping games it's generally
>> considered good to be page aligned and page sized, with zero padding.
>>
>> > > This would have a couple of advantages:
>> > >
>> > >  - This would give kernel address randomization more teeth on x86.
>> > >
>> > >  - An additional advantage would be that rootkits overwriting the GDT would have
>> > >    a bit more work to do.
>> > >
>> > >  - A third advantage would be that for NUMA systems we could 'mirror' the GDT into
>> > >    node-local memory and load those. This makes GDT load cache-misses a bit less
>> > >    expensive.
>> >
>> > GDT is per-cpu. Isn't per-cpu memory already NUMA-local?
>>
>> Indeed it is:
>>
>> fomalhaut:~> for ((cpu=1; cpu<9; cpu++)); do taskset $cpu ./sgdt ;
>done
>> SGDT: ffff88103fa09000 / 007f
>> SGDT: ffff88103fa29000 / 007f
>> SGDT: ffff88103fa29000 / 007f
>> SGDT: ffff88103fa49000 / 007f
>> SGDT: ffff88103fa49000 / 007f
>> SGDT: ffff88103fa49000 / 007f
>> SGDT: ffff88103fa29000 / 007f
>> SGDT: ffff88103fa69000 / 007f
>>
>> I confused it with the IDT, which is still global.
>>
>> This also means that the GDT in itself does not leak kernel addresses
>at the
>> moment, except it leaks the layout of the percpu area.
>>
>> So my suggestion would be to:
>>
>>  - make the GDT unconditionally page aligned and sized, then remap it
>to a
>>    read-only address unconditionally as well, like we do it for the
>IDT.
>
>Does anyone know what happens if you stick a non-accessed segment in
>the GDT, map the GDT RO, and access it?  The docs are extremely vague
>on the interplay between segmentation and paging on the segmentation
>structures themselves.  My guess is that it causes #PF.  This might
>break set_thread_area users unless we change set_thread_area to force
>the accessed bit on.
>
>There's a possible worse failure mode: if someone pokes an un-accessed
>segment into SS or CS using sigreturn, then it's within the realm of
>possibility that IRET would generate #PF (hey Intel and AMD, please
>document this!).  I don't think that would be rootable, but at the
>very least we'd want to make sure it doesn't OOPS by either making it
>impossible or adding an explicit test to sigreturn.c.
>
>hpa pointed out in another thread that the GDT *must* be writable on
>32-bit kernels because we use a task gate for NMI and jumping through
>a task gate writes to the GDT.
>
>On another note, SGDT is considerably faster than LSL, at least on
>Sandy Bridge.  The vdso might be able to take advantage of that for
>getcpu.
>
>--Andy
Andy Lutomirski Sept. 29, 2015, 6:22 p.m. UTC | #10
On Tue, Sep 29, 2015 at 11:18 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> SGDT would be easy to use, and it is logical that it is faster since it reads an internal register.  SIDT does too but unlike the GDT has a secondary limit (it can never be larger than 4096 bytes) and so all limits in the range 4095-65535 are exactly equivalent.
>

Using the IDT limit would have been a great idea if Intel hadn't
decided to clobber it on every VM exit.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
H. Peter Anvin Sept. 29, 2015, 6:27 p.m. UTC | #11
Ugh.  Didn't realize that.

On September 29, 2015 11:22:04 AM PDT, Andy Lutomirski <luto@amacapital.net> wrote:
>On Tue, Sep 29, 2015 at 11:18 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> SGDT would be easy to use, and it is logical that it is faster since
>it reads an internal register.  SIDT does too but unlike the GDT has a
>secondary limit (it can never be larger than 4096 bytes) and so all
>limits in the range 4095-65535 are exactly equivalent.
>>
>
>Using the IDT limit would have been a great idea if Intel hadn't
>decided to clobber it on every VM exit.
>
>--Andy
H. Peter Anvin Sept. 29, 2015, 8:30 p.m. UTC | #12
On 09/29/2015 11:02 AM, Andy Lutomirski wrote:
> On Tue, Sep 29, 2015 at 10:50 AM, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
>> On Tue, Sep 29, 2015 at 1:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>
>>> Does anyone know what happens if you stick a non-accessed segment in
>>> the GDT, map the GDT RO, and access it?
>>
>> You should get a #PF, as you guess, but go ahead and test it if you
>> want to make sure.
> 
> Then I think that, if we do this, the patch series should first make
> it percpu and fixmapped but RW and then flip it RO as a separate patch
> in case we need to revert the actual RO bit.  I don't want to break
> Wine or The Witcher 2 because of this, and we might need various
> fixups.  I really hope that no one uses get_thread_area to check
> whether TLS has been accessed.
> 

Of course.

	-hpa


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Sept. 30, 2015, 1:20 a.m. UTC | #13
Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Tue, Sep 29, 2015 at 1:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>
>> Does anyone know what happens if you stick a non-accessed segment in
>> the GDT, map the GDT RO, and access it?
>
> You should get a #PF, as you guess, but go ahead and test it if you
> want to make sure.

I tested this by accident once when working on what has become known
as coreboot.  Early in boot with your GDT in a EEPROM switching from
real mode to 32bit protected mode causes a write and locks up the
machine when the hardware declines the write to the GDT to set the
accessed bit.  As I recall the write kept being retried and retried and
retried...

Setting the access bit in the GDT cleared up the problem and I did not
look back.

Way up in 64bit mode something might be different, but I don't know why
cpu designers would waste the silicon.

Eric
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
H. Peter Anvin Sept. 30, 2015, 1:31 a.m. UTC | #14
On 09/29/2015 06:20 PM, Eric W. Biederman wrote:
> Linus Torvalds <torvalds@linux-foundation.org> writes:
> 
>> On Tue, Sep 29, 2015 at 1:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>
>>> Does anyone know what happens if you stick a non-accessed segment in
>>> the GDT, map the GDT RO, and access it?
>>
>> You should get a #PF, as you guess, but go ahead and test it if you
>> want to make sure.
> 
> I tested this by accident once when working on what has become known
> as coreboot.  Early in boot with your GDT in a EEPROM switching from
> real mode to 32bit protected mode causes a write and locks up the
> machine when the hardware declines the write to the GDT to set the
> accessed bit.  As I recall the write kept being retried and retried and
> retried...
> 
> Setting the access bit in the GDT cleared up the problem and I did not
> look back.
> 
> Way up in 64bit mode something might be different, but I don't know why
> cpu designers would waste the silicon.
> 

This is totally different from a TLB violation.  In your case, the write
goes through as far as the CPU is concerned, but when the data is
fetched back, it hasn't changed.  A write to a TLB-protected location
will #PF.

	-hpa


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Sept. 30, 2015, 2:11 a.m. UTC | #15
"H. Peter Anvin" <hpa@zytor.com> writes:

> On 09/29/2015 06:20 PM, Eric W. Biederman wrote:
>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>> 
>>> On Tue, Sep 29, 2015 at 1:35 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>>>
>>>> Does anyone know what happens if you stick a non-accessed segment in
>>>> the GDT, map the GDT RO, and access it?
>>>
>>> You should get a #PF, as you guess, but go ahead and test it if you
>>> want to make sure.
>> 
>> I tested this by accident once when working on what has become known
>> as coreboot.  Early in boot with your GDT in a EEPROM switching from
>> real mode to 32bit protected mode causes a write and locks up the
>> machine when the hardware declines the write to the GDT to set the
>> accessed bit.  As I recall the write kept being retried and retried and
>> retried...
>> 
>> Setting the access bit in the GDT cleared up the problem and I did not
>> look back.
>> 
>> Way up in 64bit mode something might be different, but I don't know why
>> cpu designers would waste the silicon.
>> 
>
> This is totally different from a TLB violation.  In your case, the write
> goes through as far as the CPU is concerned, but when the data is
> fetched back, it hasn't changed.  A write to a TLB-protected location
> will #PF.

The key point is that a write is generated when the cpu needs to set the
access bit.  I agree the failure points are different.  A TLB fault vs a
case where the hardware did not accept the write.

The idea of a cpu reading back data (and not trusting its cache
coherency controls) to verify the access bit gets set seems mind
boggling.  That is slow, stupid, racy and incorrect.  Incorrect as the
cpu should not only set the access bit once per segment register load.

In my case I am pretty certain it was something very weird with the
hardware not accepting the write and either not acknowledging the bus
transaction or cancelling it.  In which case the cpu knew the write had
not made it to the ``memory'' and was trying to cope.

Eric



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
H. Peter Anvin Sept. 30, 2015, 2:26 a.m. UTC | #16
No, it is a natural result of an implementation which treats setting the A bit as an abnormal flow (e.g. in microcode as opposed to hardware).

On September 29, 2015 7:11:59 PM PDT, ebiederm@xmission.com wrote:
>"H. Peter Anvin" <hpa@zytor.com> writes:
>
>> On 09/29/2015 06:20 PM, Eric W. Biederman wrote:
>>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>>> 
>>>> On Tue, Sep 29, 2015 at 1:35 PM, Andy Lutomirski
><luto@amacapital.net> wrote:
>>>>>
>>>>> Does anyone know what happens if you stick a non-accessed segment
>in
>>>>> the GDT, map the GDT RO, and access it?
>>>>
>>>> You should get a #PF, as you guess, but go ahead and test it if you
>>>> want to make sure.
>>> 
>>> I tested this by accident once when working on what has become
>known
>>> as coreboot.  Early in boot with your GDT in a EEPROM switching from
>>> real mode to 32bit protected mode causes a write and locks up the
>>> machine when the hardware declines the write to the GDT to set the
>>> accessed bit.  As I recall the write kept being retried and retried
>and
>>> retried...
>>> 
>>> Setting the access bit in the GDT cleared up the problem and I did
>not
>>> look back.
>>> 
>>> Way up in 64bit mode something might be different, but I don't know
>why
>>> cpu designers would waste the silicon.
>>> 
>>
>> This is totally different from a TLB violation.  In your case, the
>write
>> goes through as far as the CPU is concerned, but when the data is
>> fetched back, it hasn't changed.  A write to a TLB-protected location
>> will #PF.
>
>The key point is that a write is generated when the cpu needs to set
>the
>access bit.  I agree the failure points are different.  A TLB fault vs
>a
>case where the hardware did not accept the write.
>
>The idea of a cpu reading back data (and not trusting its cache
>coherency controls) to verify the access bit gets set seems mind
>boggling.  That is slow, stupid, racy and incorrect.  Incorrect as the
>cpu should not only set the access bit once per segment register load.
>
>In my case I am pretty certain it was something very weird with the
>hardware not accepting the write and either not acknowledging the bus
>transaction or cancelling it.  In which case the cpu knew the write had
>not made it to the ``memory'' and was trying to cope.
>
>Eric
diff mbox

Patch

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b2909bf..bc6ae1c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -429,7 +429,7 @@  ldt_ss:
  * compensating for the offset by changing to the ESPFIX segment with
  * a base address that matches for the difference.
  */
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_SS PER_CPU_VAR(cpu_gdt) + (GDT_ENTRY_ESPFIX_SS * 8)
 	mov	%esp, %edx			/* load kernel esp */
 	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
 	mov	%dx, %ax			/* eax: new kernel esp */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4e10d73..76de300 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -39,15 +39,21 @@  extern gate_desc idt_table[];
 extern struct desc_ptr debug_idt_descr;
 extern gate_desc debug_idt_table[];
 
-struct gdt_page {
+struct cpu_gdt {
 	struct desc_struct gdt[GDT_ENTRIES];
-} __attribute__((aligned(PAGE_SIZE)));
-
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+}
+#ifdef CONFIG_HYPERVISOR_GUEST
+/* Xen et al want GDT to have its own page. They remap it read-only */
+__attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt);
+#else
+____cacheline_aligned;
+DECLARE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt);
+#endif
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
-	return per_cpu(gdt_page, cpu).gdt;
+	return per_cpu(cpu_gdt, cpu).gdt;
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index de22ea7..6b90785 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -92,7 +92,13 @@  static const struct cpu_dev default_cpu = {
 
 static const struct cpu_dev *this_cpu = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#ifdef CONFIG_HYPERVISOR_GUEST
+/* Xen et al want GDT to have its own page. They remap it read-only */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt) =
+#else
+DEFINE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt) =
+#endif
+{ .gdt = {
 #ifdef CONFIG_X86_64
 	/*
 	 * We need valid kernel segments for data and code in long mode too
@@ -144,7 +150,7 @@  DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	GDT_STACK_CANARY_INIT
 #endif
 } };
-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt);
 
 static int __init x86_mpx_setup(char *s)
 {
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 66dd3fe9..41ebc94 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2198,7 +2198,7 @@  static unsigned long get_segment_base(unsigned int segment)
 		if (idx > GDT_ENTRIES)
 			return 0;
 
-		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+		desc = raw_cpu_ptr(cpu_gdt.gdt) + idx;
 	}
 
 	return get_desc_base(desc);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0e2d96f..33c9d10 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -521,7 +521,7 @@  setup_once:
 	 * relocation.  Manually set base address in stack canary
 	 * segment descriptor.
 	 */
-	movl $gdt_page,%eax
+	movl $cpu_gdt,%eax
 	movl $stack_canary,%ecx
 	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
 	shrl $16, %ecx
@@ -758,7 +758,7 @@  idt_descr:
 	.word 0				# 32 bit align gdt_desc.address
 ENTRY(early_gdt_descr)
 	.word GDT_ENTRIES*8-1
-	.long gdt_page			/* Overwritten for secondary CPUs */
+	.long cpu_gdt			/* Overwritten for secondary CPUs */
 
 /*
  * The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 1d40ca8..5a80e8c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -510,7 +510,7 @@  NEXT_PAGE(level1_fixmap_pgt)
 early_gdt_descr:
 	.word	GDT_ENTRIES*8-1
 early_gdt_descr_base:
-	.quad	INIT_PER_CPU_VAR(gdt_page)
+	.quad	INIT_PER_CPU_VAR(cpu_gdt)
 
 ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 74e4bf1..198e46b 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -348,7 +348,7 @@  SECTIONS
  * for the boot processor.
  */
 #define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(cpu_gdt);
 INIT_PER_CPU(irq_stack_union);
 
 /*
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 0c2fae8..5a1da82 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -736,7 +736,7 @@  static void percpu_init(void)
  *
  * The "gold" linker incorrectly associates:
  *	init_per_cpu__irq_stack_union
- *	init_per_cpu__gdt_page
+ *	init_per_cpu__cpu_gdt
  */
 static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
 {
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 30d12af..baecfc6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1592,10 +1592,10 @@  asmlinkage __visible void __init xen_start_kernel(void)
 
 	/*
 	 * The only reliable way to retain the initial address of the
-	 * percpu gdt_page is to remember it here, so we can go and
+	 * percpu cpu_gdt is to remember it here, so we can go and
 	 * mark it RW later, when the initial percpu area is freed.
 	 */
-	xen_initial_gdt = &per_cpu(gdt_page, 0);
+	xen_initial_gdt = &per_cpu(cpu_gdt, 0);
 
 	xen_smp_init();