diff mbox

xen/x86: Use 2M superpages for text/data/bss mappings

Message ID 1455818622-30625-4-git-send-email-andrew.cooper3@citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Andrew Cooper Feb. 18, 2016, 6:03 p.m. UTC
This balloons the size of Xen in memory from 3.4MB to 12MB, because of the
required alignment adjustments.

However
 * All mappings are 2M superpages.
 * .text (and .init at boot) are the only sections marked executable.
 * .text and .rodata are marked read-only.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
---
 xen/arch/x86/setup.c     | 44 +++++++++++++++++++++++++++++++++++++++++---
 xen/arch/x86/xen.lds.S   | 33 +++++++++++++++++++++++++++++++++
 xen/include/xen/kernel.h |  6 ++++++
 3 files changed, 80 insertions(+), 3 deletions(-)

Comments

Jan Beulich Feb. 19, 2016, 2:58 p.m. UTC | #1
>>> On 18.02.16 at 19:03, <andrew.cooper3@citrix.com> wrote:
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -921,13 +921,51 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>              /* The only data mappings to be relocated are in the Xen area. */
>              pl2e = __va(__pa(l2_xenmap));
>              *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
> -                                   PAGE_HYPERVISOR_RWX | _PAGE_PSE);
> +                                   PAGE_HYPERVISOR_RX | _PAGE_PSE);
>              for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
>              {
> +                unsigned int flags;
> +
>                  if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
>                      continue;
> -                *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
> -                                        xen_phys_start);
> +
> +                if ( /*
> +                      * Should be:
> +                      *
> +                      * i >= l2_table_offset((unsigned long)&__2M_text_start) &&
> +                      *
> +                      * but the EFI build can't manage the relocation.  It
> +                      * evaluates to 0, so just use the upper bound.
> +                      */
> +                     i < l2_table_offset((unsigned long)&__2M_text_end) )

I'll need some more detail about this, not the least because
excusing what looks like a hack with EFI, under which we won't
ever get here, is suspicious.

> +                {
> +                    flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
> +                }
> +                else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
> +                          i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
> +                {
> +                    flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
> +                }
> +                else if ( (i >= l2_table_offset((unsigned long)&__2M_data_start) &&
> +                           i <  l2_table_offset((unsigned long)&__2M_data_end)) ||
> +                          (i >= l2_table_offset((unsigned long)&__2M_bss_start) &&
> +                           i <  l2_table_offset((unsigned long)&__2M_bss_end)) )

This is odd - why can't .data and .bss share a (multiple of) 2M
region, at once presumably getting the whole image down to 10M
again?

> --- a/xen/arch/x86/xen.lds.S
> +++ b/xen/arch/x86/xen.lds.S
> @@ -38,6 +38,9 @@ SECTIONS
>    . = __XEN_VIRT_START;
>    __image_base__ = .;
>  #endif
> +
> +  __2M_text_start = .;         /* Start of 2M superpages, mapped RX. */

Is the reason for aforementioned build problem perhaps the fact
that this label (and the others too) lives outside of any section?

> --- a/xen/include/xen/kernel.h
> +++ b/xen/include/xen/kernel.h
> @@ -65,6 +65,12 @@
>  	1;                                      \
>  })
>  
> +extern unsigned long __2M_text_start[], __2M_text_end[];
> +extern unsigned long __2M_rodata_start[], __2M_rodata_end[];
> +extern unsigned long __2M_data_start[], __2M_data_end[];
> +extern unsigned long __2M_init_start[], __2M_init_end[];
> +extern unsigned long __2M_bss_start[], __2M_bss_end[];

I'd really like to see at least the ones which are reference to r/o
sections marked const.

Jan
Andrew Cooper Feb. 19, 2016, 3:51 p.m. UTC | #2
On 19/02/16 14:58, Jan Beulich wrote:
>>>> On 18.02.16 at 19:03, <andrew.cooper3@citrix.com> wrote:
>> --- a/xen/arch/x86/setup.c
>> +++ b/xen/arch/x86/setup.c
>> @@ -921,13 +921,51 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>>              /* The only data mappings to be relocated are in the Xen area. */
>>              pl2e = __va(__pa(l2_xenmap));
>>              *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
>> -                                   PAGE_HYPERVISOR_RWX | _PAGE_PSE);
>> +                                   PAGE_HYPERVISOR_RX | _PAGE_PSE);
>>              for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
>>              {
>> +                unsigned int flags;
>> +
>>                  if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
>>                      continue;
>> -                *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
>> -                                        xen_phys_start);
>> +
>> +                if ( /*
>> +                      * Should be:
>> +                      *
>> +                      * i >= l2_table_offset((unsigned long)&__2M_text_start) &&
>> +                      *
>> +                      * but the EFI build can't manage the relocation.  It
>> +                      * evaluates to 0, so just use the upper bound.
>> +                      */
>> +                     i < l2_table_offset((unsigned long)&__2M_text_end) )
> I'll need some more detail about this, not the least because
> excusing what looks like a hack with EFI, under which we won't
> ever get here, is suspicious.

Specifically, the EFI uses i386pep and it objects to a 64bit relocation
of 0xfffffffffffffffc.

I can't explain why this symbol ends up with that relocation.

>
>> +                {
>> +                    flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
>> +                }
>> +                else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
>> +                          i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
>> +                {
>> +                    flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
>> +                }
>> +                else if ( (i >= l2_table_offset((unsigned long)&__2M_data_start) &&
>> +                           i <  l2_table_offset((unsigned long)&__2M_data_end)) ||
>> +                          (i >= l2_table_offset((unsigned long)&__2M_bss_start) &&
>> +                           i <  l2_table_offset((unsigned long)&__2M_bss_end)) )
> This is odd - why can't .data and .bss share a (multiple of) 2M
> region, at once presumably getting the whole image down to 10M
> again?

.init is between .data and .bss, to allow .bss to be the final section
and not included in the result of mkelf32

>
>> --- a/xen/arch/x86/xen.lds.S
>> +++ b/xen/arch/x86/xen.lds.S
>> @@ -38,6 +38,9 @@ SECTIONS
>>    . = __XEN_VIRT_START;
>>    __image_base__ = .;
>>  #endif
>> +
>> +  __2M_text_start = .;         /* Start of 2M superpages, mapped RX. */
> Is the reason for aforementioned build problem perhaps the fact
> that this label (and the others too) lives outside of any section?

I am not sure.  It is only this symbol which is a problem.  All others
are fine.

I actually intended this to be an RFC patch, to see if anyone had
suggestions.

>
>> --- a/xen/include/xen/kernel.h
>> +++ b/xen/include/xen/kernel.h
>> @@ -65,6 +65,12 @@
>>  	1;                                      \
>>  })
>>  
>> +extern unsigned long __2M_text_start[], __2M_text_end[];
>> +extern unsigned long __2M_rodata_start[], __2M_rodata_end[];
>> +extern unsigned long __2M_data_start[], __2M_data_end[];
>> +extern unsigned long __2M_init_start[], __2M_init_end[];
>> +extern unsigned long __2M_bss_start[], __2M_bss_end[];
> I'd really like to see at least the ones which are reference to r/o
> sections marked const.

Ok.  I should probably also make them char rather than unsigned long, so
pointer arithmetic works sensibly for the length.

~Andrew
Jan Beulich Feb. 22, 2016, 9:55 a.m. UTC | #3
>>> On 19.02.16 at 16:51, <andrew.cooper3@citrix.com> wrote:
> On 19/02/16 14:58, Jan Beulich wrote:
>>>>> On 18.02.16 at 19:03, <andrew.cooper3@citrix.com> wrote:
>>> --- a/xen/arch/x86/setup.c
>>> +++ b/xen/arch/x86/setup.c
>>> @@ -921,13 +921,51 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>>>              /* The only data mappings to be relocated are in the Xen area. */
>>>              pl2e = __va(__pa(l2_xenmap));
>>>              *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
>>> -                                   PAGE_HYPERVISOR_RWX | _PAGE_PSE);
>>> +                                   PAGE_HYPERVISOR_RX | _PAGE_PSE);
>>>              for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
>>>              {
>>> +                unsigned int flags;
>>> +
>>>                  if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
>>>                      continue;
>>> -                *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
>>> -                                        xen_phys_start);
>>> +
>>> +                if ( /*
>>> +                      * Should be:
>>> +                      *
>>> +                      * i >= l2_table_offset((unsigned long)&__2M_text_start) &&
>>> +                      *
>>> +                      * but the EFI build can't manage the relocation.  It
>>> +                      * evaluates to 0, so just use the upper bound.
>>> +                      */
>>> +                     i < l2_table_offset((unsigned long)&__2M_text_end) )
>> I'll need some more detail about this, not the least because
>> excusing what looks like a hack with EFI, under which we won't
>> ever get here, is suspicious.
> 
> Specifically, the EFI uses i386pep and it objects to a 64bit relocation
> of 0xfffffffffffffffc.
> 
> I can't explain why this symbol ends up with that relocation.

Interesting.

>>> +                {
>>> +                    flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
>>> +                }
>>> +                else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
>>> +                          i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
>>> +                {
>>> +                    flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
>>> +                }
>>> +                else if ( (i >= l2_table_offset((unsigned long)&__2M_data_start) &&
>>> +                           i <  l2_table_offset((unsigned long)&__2M_data_end)) ||
>>> +                          (i >= l2_table_offset((unsigned long)&__2M_bss_start) &&
>>> +                           i <  l2_table_offset((unsigned long)&__2M_bss_end)) )
>> This is odd - why can't .data and .bss share a (multiple of) 2M
>> region, at once presumably getting the whole image down to 10M
>> again?
> 
> .init is between .data and .bss, to allow .bss to be the final section
> and not included in the result of mkelf32

But I don't think it needs to remain there? Should be possible to be
put between .text and .rodata, or between .rodata and .data ...

>>> --- a/xen/arch/x86/xen.lds.S
>>> +++ b/xen/arch/x86/xen.lds.S
>>> @@ -38,6 +38,9 @@ SECTIONS
>>>    . = __XEN_VIRT_START;
>>>    __image_base__ = .;
>>>  #endif
>>> +
>>> +  __2M_text_start = .;         /* Start of 2M superpages, mapped RX. */
>> Is the reason for aforementioned build problem perhaps the fact
>> that this label (and the others too) lives outside of any section?
> 
> I am not sure.  It is only this symbol which is a problem.  All others
> are fine.
> 
> I actually intended this to be an RFC patch, to see if anyone had
> suggestions.

Since you now imply this to be at the image base, I don't see
why using e.g. __XEN_VIRT_START (in its place, or via #define)
wouldn't be as good an option.

>>> --- a/xen/include/xen/kernel.h
>>> +++ b/xen/include/xen/kernel.h
>>> @@ -65,6 +65,12 @@
>>>  	1;                                      \
>>>  })
>>>  
>>> +extern unsigned long __2M_text_start[], __2M_text_end[];
>>> +extern unsigned long __2M_rodata_start[], __2M_rodata_end[];
>>> +extern unsigned long __2M_data_start[], __2M_data_end[];
>>> +extern unsigned long __2M_init_start[], __2M_init_end[];
>>> +extern unsigned long __2M_bss_start[], __2M_bss_end[];
>> I'd really like to see at least the ones which are reference to r/o
>> sections marked const.
> 
> Ok.  I should probably also make them char rather than unsigned long, so
> pointer arithmetic works sensibly for the length.

Good idea indeed.

Jan
Andrew Cooper Feb. 22, 2016, 10:24 a.m. UTC | #4
On 22/02/16 09:55, Jan Beulich wrote:
>
>>>> +                {
>>>> +                    flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
>>>> +                }
>>>> +                else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
>>>> +                          i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
>>>> +                {
>>>> +                    flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
>>>> +                }
>>>> +                else if ( (i >= l2_table_offset((unsigned long)&__2M_data_start) &&
>>>> +                           i <  l2_table_offset((unsigned long)&__2M_data_end)) ||
>>>> +                          (i >= l2_table_offset((unsigned long)&__2M_bss_start) &&
>>>> +                           i <  l2_table_offset((unsigned long)&__2M_bss_end)) )
>>> This is odd - why can't .data and .bss share a (multiple of) 2M
>>> region, at once presumably getting the whole image down to 10M
>>> again?
>> .init is between .data and .bss, to allow .bss to be the final section
>> and not included in the result of mkelf32
> But I don't think it needs to remain there? Should be possible to be
> put between .text and .rodata, or between .rodata and .data ...

Actually yes - shifting .init to between .rodata and .data should work fine.

>
>>>> --- a/xen/arch/x86/xen.lds.S
>>>> +++ b/xen/arch/x86/xen.lds.S
>>>> @@ -38,6 +38,9 @@ SECTIONS
>>>>    . = __XEN_VIRT_START;
>>>>    __image_base__ = .;
>>>>  #endif
>>>> +
>>>> +  __2M_text_start = .;         /* Start of 2M superpages, mapped RX. */
>>> Is the reason for aforementioned build problem perhaps the fact
>>> that this label (and the others too) lives outside of any section?
>> I am not sure.  It is only this symbol which is a problem.  All others
>> are fine.
>>
>> I actually intended this to be an RFC patch, to see if anyone had
>> suggestions.
> Since you now imply this to be at the image base, I don't see
> why using e.g. __XEN_VIRT_START (in its place, or via #define)
> wouldn't be as good an option.

I don't understand what you mean here.

If you are suggesting #define __2M_text_start __XEN_VIRT_START then I
don't see how that is going to help with the relocation.

~Andrew
Jan Beulich Feb. 22, 2016, 10:43 a.m. UTC | #5
>>> On 22.02.16 at 11:24, <andrew.cooper3@citrix.com> wrote:
> On 22/02/16 09:55, Jan Beulich wrote:
>>>>> --- a/xen/arch/x86/xen.lds.S
>>>>> +++ b/xen/arch/x86/xen.lds.S
>>>>> @@ -38,6 +38,9 @@ SECTIONS
>>>>>    . = __XEN_VIRT_START;
>>>>>    __image_base__ = .;
>>>>>  #endif
>>>>> +
>>>>> +  __2M_text_start = .;         /* Start of 2M superpages, mapped RX. */
>>>> Is the reason for aforementioned build problem perhaps the fact
>>>> that this label (and the others too) lives outside of any section?
>>> I am not sure.  It is only this symbol which is a problem.  All others
>>> are fine.
>>>
>>> I actually intended this to be an RFC patch, to see if anyone had
>>> suggestions.
>> Since you now imply this to be at the image base, I don't see
>> why using e.g. __XEN_VIRT_START (in its place, or via #define)
>> wouldn't be as good an option.
> 
> I don't understand what you mean here.
> 
> If you are suggesting #define __2M_text_start __XEN_VIRT_START then I
> don't see how that is going to help with the relocation.

Why not? __XEN_VIRT_START is a constant, i.e. won't result in
any relocations to get emitted.

Jan
diff mbox

Patch

diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index cddf954..c360fbc 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -921,13 +921,51 @@  void __init noreturn __start_xen(unsigned long mbi_p)
             /* The only data mappings to be relocated are in the Xen area. */
             pl2e = __va(__pa(l2_xenmap));
             *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
-                                   PAGE_HYPERVISOR_RWX | _PAGE_PSE);
+                                   PAGE_HYPERVISOR_RX | _PAGE_PSE);
             for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
             {
+                unsigned int flags;
+
                 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
                     continue;
-                *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
-                                        xen_phys_start);
+
+                if ( /*
+                      * Should be:
+                      *
+                      * i >= l2_table_offset((unsigned long)&__2M_text_start) &&
+                      *
+                      * but the EFI build can't manage the relocation.  It
+                      * evaluates to 0, so just use the upper bound.
+                      */
+                     i < l2_table_offset((unsigned long)&__2M_text_end) )
+                {
+                    flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
+                }
+                else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
+                          i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
+                {
+                    flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
+                }
+                else if ( (i >= l2_table_offset((unsigned long)&__2M_data_start) &&
+                           i <  l2_table_offset((unsigned long)&__2M_data_end)) ||
+                          (i >= l2_table_offset((unsigned long)&__2M_bss_start) &&
+                           i <  l2_table_offset((unsigned long)&__2M_bss_end)) )
+                {
+                    flags = PAGE_HYPERVISOR_RW | _PAGE_PSE;
+                }
+                else if ( i >= l2_table_offset((unsigned long)&__2M_init_start) &&
+                          i <  l2_table_offset((unsigned long)&__2M_init_end) )
+                {
+                    flags = PAGE_HYPERVISOR_RWX | _PAGE_PSE;
+                }
+                else
+                {
+                    *pl2e = l2e_empty();
+                    continue;
+                }
+
+                *pl2e = l2e_from_paddr(
+                    l2e_get_paddr(*pl2e) + xen_phys_start, flags);
             }
 
             /* Re-sync the stack and then switch to relocated pagetables. */
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index 9fde1db..6c23c02 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -38,6 +38,9 @@  SECTIONS
   . = __XEN_VIRT_START;
   __image_base__ = .;
 #endif
+
+  __2M_text_start = .;         /* Start of 2M superpages, mapped RX. */
+
   . = __XEN_VIRT_START + MB(1);
   _start = .;
   .text : {
@@ -50,6 +53,10 @@  SECTIONS
        _etext = .;             /* End of text section */
   } :text = 0x9090
 
+  . = ALIGN(MB(2));
+  __2M_text_end = .;
+
+  __2M_rodata_start = .;       /* Start of 2M superpages, mapped RO. */
   .rodata : {
        /* Bug frames table */
        . = ALIGN(4);
@@ -67,6 +74,10 @@  SECTIONS
        *(.rodata.*)
   } :text
 
+  . = ALIGN(MB(2));
+  __2M_rodata_end = .;
+
+  __2M_data_start = .;         /* Start of 2M superpages, mapped RW. */
   . = ALIGN(SMP_CACHE_BYTES);
   .data.read_mostly : {
        /* Exception table */
@@ -104,6 +115,10 @@  SECTIONS
   __lock_profile_end = .;
 #endif
 
+  . = ALIGN(MB(2));
+  __2M_data_end = .;
+
+  __2M_init_start = .;         /* Start of 2M superpages, mapped RWX (boot only). */
   . = ALIGN(PAGE_SIZE);             /* Init code and data */
   __init_begin = .;
   .init.text : {
@@ -166,6 +181,10 @@  SECTIONS
   . = ALIGN(STACK_SIZE);
   __init_end = .;
 
+  . = ALIGN(MB(2));
+  __2M_init_end = .;
+
+  __2M_bss_start = .;          /* Start of 2M superpages, mapped RW. */
   .bss : {                     /* BSS */
        __bss_start = .;
        *(.bss.stack_aligned)
@@ -183,6 +202,9 @@  SECTIONS
   } :text
   _end = . ;
 
+  . = ALIGN(MB(2));
+  __2M_bss_end = .;
+
 #ifdef EFI
   . = ALIGN(4);
   .reloc : {
@@ -229,4 +251,15 @@  ASSERT(__image_base__ > XEN_VIRT_START ||
 ASSERT(kexec_reloc_size - kexec_reloc <= PAGE_SIZE, "kexec_reloc is too large")
 #endif
 
+ASSERT(IS_ALIGNED(__2M_text_start,   MB(2)), "__2M_text_start misaligned")
+ASSERT(IS_ALIGNED(__2M_text_end,     MB(2)), "__2M_text_end misaligned")
+ASSERT(IS_ALIGNED(__2M_rodata_start, MB(2)), "__2M_rodata_start misaligned")
+ASSERT(IS_ALIGNED(__2M_rodata_end,   MB(2)), "__2M_rodata_end misaligned")
+ASSERT(IS_ALIGNED(__2M_data_start,   MB(2)), "__2M_data_start misaligned")
+ASSERT(IS_ALIGNED(__2M_data_end,     MB(2)), "__2M_data_end misaligned")
+ASSERT(IS_ALIGNED(__2M_init_start,   MB(2)), "__2M_init_start misaligned")
+ASSERT(IS_ALIGNED(__2M_init_end,     MB(2)), "__2M_init_end misaligned")
+ASSERT(IS_ALIGNED(__2M_bss_start,    MB(2)), "__2M_bss_start misaligned")
+ASSERT(IS_ALIGNED(__2M_bss_end,      MB(2)), "__2M_bss_end misaligned")
+
 ASSERT(IS_ALIGNED(cpu0_stack, STACK_SIZE), "cpu0_stack misaligned")
diff --git a/xen/include/xen/kernel.h b/xen/include/xen/kernel.h
index 548b64d..8d08eca 100644
--- a/xen/include/xen/kernel.h
+++ b/xen/include/xen/kernel.h
@@ -65,6 +65,12 @@ 
 	1;                                      \
 })
 
+extern unsigned long __2M_text_start[], __2M_text_end[];
+extern unsigned long __2M_rodata_start[], __2M_rodata_end[];
+extern unsigned long __2M_data_start[], __2M_data_end[];
+extern unsigned long __2M_init_start[], __2M_init_end[];
+extern unsigned long __2M_bss_start[], __2M_bss_end[];
+
 extern char _start[], _end[], start[];
 #define is_kernel(p) ({                         \
     char *__p = (char *)(unsigned long)(p);     \