
[2/8] x86/head: Refactor 32-bit pgtable setup

Message ID 1476468318-24422-3-git-send-email-boris.ostrovsky@oracle.com (mailing list archive)
State New, archived

Commit Message

Boris Ostrovsky Oct. 14, 2016, 6:05 p.m. UTC
From: Matt Fleming <matt@codeblueprint.co.uk>

The new Xen PVH entry point requires page tables to be set up by the
kernel, since it is entered with paging disabled.

Pull the common code out of head_32.S and into pgtable_32.S so that
setup_pgtable_32 can be invoked from both the new Xen entry point and
the existing startup_32 code.

Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
---
 arch/x86/Makefile            |   2 +
 arch/x86/kernel/Makefile     |   2 +
 arch/x86/kernel/head_32.S    | 168 +------------------------------------
 arch/x86/kernel/pgtable_32.S | 196 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 201 insertions(+), 167 deletions(-)
 create mode 100644 arch/x86/kernel/pgtable_32.S

Comments

H. Peter Anvin Oct. 14, 2016, 6:31 p.m. UTC | #1
On October 14, 2016 11:05:12 AM PDT, Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>From: Matt Fleming <matt@codeblueprint.co.uk>
>
>The new Xen PVH entry point requires page tables to be setup by the
>kernel since it is entered with paging disabled.
>
>Pull the common code out of head_32.S and into pgtable_32.S so that
>setup_pgtable_32 can be invoked from both the new Xen entry point and
>the existing startup_32 code.
>
>Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>Cc: Thomas Gleixner <tglx@linutronix.de>
>Cc: Ingo Molnar <mingo@redhat.com>
>Cc: "H. Peter Anvin" <hpa@zytor.com>
>Cc: x86@kernel.org
>Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
>---
> arch/x86/Makefile            |   2 +
> arch/x86/kernel/Makefile     |   2 +
> arch/x86/kernel/head_32.S    | 168 +------------------------------------
> arch/x86/kernel/pgtable_32.S | 196 +++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 201 insertions(+), 167 deletions(-)
> create mode 100644 arch/x86/kernel/pgtable_32.S
>
>diff --git a/arch/x86/Makefile b/arch/x86/Makefile
>index 2d44933..67cc771 100644
>--- a/arch/x86/Makefile
>+++ b/arch/x86/Makefile
>@@ -204,6 +204,8 @@ head-y += arch/x86/kernel/head$(BITS).o
> head-y += arch/x86/kernel/ebda.o
> head-y += arch/x86/kernel/platform-quirks.o
> 
>+head-$(CONFIG_X86_32) += arch/x86/kernel/pgtable_32.o
>+
> libs-y  += arch/x86/lib/
> 
> # See arch/x86/Kbuild for content of core part of the kernel
>diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
>index 4dd5d50..eae85a5 100644
>--- a/arch/x86/kernel/Makefile
>+++ b/arch/x86/kernel/Makefile
>@@ -8,6 +8,8 @@ extra-y	+= ebda.o
> extra-y	+= platform-quirks.o
> extra-y	+= vmlinux.lds
> 
>+extra-$(CONFIG_X86_32) += pgtable_32.o
>+
> CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
> 
> ifdef CONFIG_FUNCTION_TRACER
>diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
>index 5f40126..0db066e 100644
>--- a/arch/x86/kernel/head_32.S
>+++ b/arch/x86/kernel/head_32.S
>@@ -41,51 +41,6 @@
> #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
> 
> /*
>- * This is how much memory in addition to the memory covered up to
>- * and including _end we need mapped initially.
>- * We need:
>- *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
>- *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
>- *
>- * Modulo rounding, each megabyte assigned here requires a kilobyte of
>- * memory, which is currently unreclaimed.
>- *
>- * This should be a multiple of a page.
>- *
>- * KERNEL_IMAGE_SIZE should be greater than pa(_end)
>- * and small than max_low_pfn, otherwise will waste some page table
>entries
>- */
>-
>-#if PTRS_PER_PMD > 1
>-#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
>-#else
>-#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
>-#endif
>-
>-/*
>- * Number of possible pages in the lowmem region.
>- *
>- * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
>- * gas warning about overflowing shift count when gas has been
>compiled
>- * with only a host target support using a 32-bit type for internal
>- * representation.
>- */
>-LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
>-
>-/* Enough space to fit pagetables for the low memory linear map */
>-MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
>-
>-/*
>- * Worst-case size of the kernel mapping we need to make:
>- * a relocatable kernel can live anywhere in lowmem, so we need to be
>able
>- * to map all of lowmem.
>- */
>-KERNEL_PAGES = LOWMEM_PAGES
>-
>-INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
>-RESERVE_BRK(pagetables, INIT_MAP_SIZE)
>-
>-/*
>  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
>  * %esi points to the real-mode code as a 32-bit pointer.
>  * CS and DS must be 4 GB flat segments, but we don't depend on
>@@ -157,92 +112,7 @@ ENTRY(startup_32)
> 	call load_ucode_bsp
> #endif
> 
>-/*
>- * Initialize page tables.  This creates a PDE and a set of page
>- * tables, which are located immediately beyond __brk_base.  The
>variable
>- * _brk_end is set up to point to the first "safe" location.
>- * Mappings are created both at virtual address 0 (identity mapping)
>- * and PAGE_OFFSET for up to _end.
>- */
>-#ifdef CONFIG_X86_PAE
>-
>-	/*
>-	 * In PAE mode initial_page_table is statically defined to contain
>-	 * enough entries to cover the VMSPLIT option (that is the top 1, 2
>or 3
>-	 * entries). The identity mapping is handled by pointing two PGD
>entries
>-	 * to the first kernel PMD.
>-	 *
>-	 * Note the upper half of each PMD or PTE are always zero at this
>stage.
>-	 */
>-
>-#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
>-
>-	xorl %ebx,%ebx				/* %ebx is kept at zero */
>-
>-	movl $pa(__brk_base), %edi
>-	movl $pa(initial_pg_pmd), %edx
>-	movl $PTE_IDENT_ATTR, %eax
>-10:
>-	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PMD entry */
>-	movl %ecx,(%edx)			/* Store PMD entry */
>-						/* Upper half already zero */
>-	addl $8,%edx
>-	movl $512,%ecx
>-11:
>-	stosl
>-	xchgl %eax,%ebx
>-	stosl
>-	xchgl %eax,%ebx
>-	addl $0x1000,%eax
>-	loop 11b
>-
>-	/*
>-	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
>-	 */
>-	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>-	cmpl %ebp,%eax
>-	jb 10b
>-1:
>-	addl $__PAGE_OFFSET, %edi
>-	movl %edi, pa(_brk_end)
>-	shrl $12, %eax
>-	movl %eax, pa(max_pfn_mapped)
>-
>-	/* Do early initialization of the fixmap area */
>-	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>-	movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
>-#else	/* Not PAE */
>-
>-page_pde_offset = (__PAGE_OFFSET >> 20);
>-
>-	movl $pa(__brk_base), %edi
>-	movl $pa(initial_page_table), %edx
>-	movl $PTE_IDENT_ATTR, %eax
>-10:
>-	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
>-	movl %ecx,(%edx)			/* Store identity PDE entry */
>-	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
>-	addl $4,%edx
>-	movl $1024, %ecx
>-11:
>-	stosl
>-	addl $0x1000,%eax
>-	loop 11b
>-	/*
>-	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
>-	 */
>-	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>-	cmpl %ebp,%eax
>-	jb 10b
>-	addl $__PAGE_OFFSET, %edi
>-	movl %edi, pa(_brk_end)
>-	shrl $12, %eax
>-	movl %eax, pa(max_pfn_mapped)
>-
>-	/* Do early initialization of the fixmap area */
>-	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>-	movl %eax,pa(initial_page_table+0xffc)
>-#endif
>+	call setup_pgtable_32
> 
> #ifdef CONFIG_PARAVIRT
> 	/* This is can only trip for a broken bootloader... */
>@@ -660,47 +530,11 @@ ENTRY(setup_once_ref)
>  */
> __PAGE_ALIGNED_BSS
> 	.align PAGE_SIZE
>-#ifdef CONFIG_X86_PAE
>-initial_pg_pmd:
>-	.fill 1024*KPMDS,4,0
>-#else
>-ENTRY(initial_page_table)
>-	.fill 1024,4,0
>-#endif
>-initial_pg_fixmap:
>-	.fill 1024,4,0
> ENTRY(empty_zero_page)
> 	.fill 4096,1,0
> ENTRY(swapper_pg_dir)
> 	.fill 1024,4,0
> 
>-/*
>- * This starts the data section.
>- */
>-#ifdef CONFIG_X86_PAE
>-__PAGE_ALIGNED_DATA
>-	/* Page-aligned for the benefit of paravirt? */
>-	.align PAGE_SIZE
>-ENTRY(initial_page_table)
>-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
>-# if KPMDS == 3
>-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
>-# elif KPMDS == 2
>-	.long	0,0
>-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>-# elif KPMDS == 1
>-	.long	0,0
>-	.long	0,0
>-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>-# else
>-#  error "Kernel PMDs should be 1, 2 or 3"
>-# endif
>-	.align PAGE_SIZE		/* needs to be page-sized too */
>-#endif
>-
> .data
> .balign 4
> ENTRY(initial_stack)
>diff --git a/arch/x86/kernel/pgtable_32.S b/arch/x86/kernel/pgtable_32.S
>new file mode 100644
>index 0000000..aded718
>--- /dev/null
>+++ b/arch/x86/kernel/pgtable_32.S
>@@ -0,0 +1,196 @@
>+#include <linux/threads.h>
>+#include <linux/init.h>
>+#include <linux/linkage.h>
>+#include <asm/segment.h>
>+#include <asm/page_types.h>
>+#include <asm/pgtable_types.h>
>+#include <asm/cache.h>
>+#include <asm/thread_info.h>
>+#include <asm/asm-offsets.h>
>+#include <asm/setup.h>
>+#include <asm/processor-flags.h>
>+#include <asm/msr-index.h>
>+#include <asm/cpufeatures.h>
>+#include <asm/percpu.h>
>+#include <asm/nops.h>
>+#include <asm/bootparam.h>
>+
>+/* Physical address */
>+#define pa(X) ((X) - __PAGE_OFFSET)
>+
>+/*
>+ * This is how much memory in addition to the memory covered up to
>+ * and including _end we need mapped initially.
>+ * We need:
>+ *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
>+ *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
>+ *
>+ * Modulo rounding, each megabyte assigned here requires a kilobyte of
>+ * memory, which is currently unreclaimed.
>+ *
>+ * This should be a multiple of a page.
>+ *
>+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
>+ * and small than max_low_pfn, otherwise will waste some page table
>entries
>+ */
>+
>+#if PTRS_PER_PMD > 1
>+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
>+#else
>+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
>+#endif
>+
>+/*
>+ * Number of possible pages in the lowmem region.
>+ *
>+ * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
>+ * gas warning about overflowing shift count when gas has been
>compiled
>+ * with only a host target support using a 32-bit type for internal
>+ * representation.
>+ */
>+LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
>+
>+/* Enough space to fit pagetables for the low memory linear map */
>+MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
>+
>+/*
>+ * Worst-case size of the kernel mapping we need to make:
>+ * a relocatable kernel can live anywhere in lowmem, so we need to be
>able
>+ * to map all of lowmem.
>+ */
>+KERNEL_PAGES = LOWMEM_PAGES
>+
>+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
>+RESERVE_BRK(pagetables, INIT_MAP_SIZE)
>+
>+/*
>+ * Initialize page tables.  This creates a PDE and a set of page
>+ * tables, which are located immediately beyond __brk_base.  The
>variable
>+ * _brk_end is set up to point to the first "safe" location.
>+ * Mappings are created both at virtual address 0 (identity mapping)
>+ * and PAGE_OFFSET for up to _end.
>+ */
>+	.text
>+ENTRY(setup_pgtable_32)
>+#ifdef CONFIG_X86_PAE
>+	/*
>+	 * In PAE mode initial_page_table is statically defined to contain
>+	 * enough entries to cover the VMSPLIT option (that is the top 1, 2
>or 3
>+	 * entries). The identity mapping is handled by pointing two PGD
>entries
>+	 * to the first kernel PMD.
>+	 *
>+	 * Note the upper half of each PMD or PTE are always zero at this
>stage.
>+	 */
>+
>+#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
>+
>+	xorl %ebx,%ebx				/* %ebx is kept at zero */
>+
>+	movl $pa(__brk_base), %edi
>+	movl $pa(initial_pg_pmd), %edx
>+	movl $PTE_IDENT_ATTR, %eax
>+10:
>+	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PMD entry */
>+	movl %ecx,(%edx)			/* Store PMD entry */
>+						/* Upper half already zero */
>+	addl $8,%edx
>+	movl $512,%ecx
>+11:
>+	stosl
>+	xchgl %eax,%ebx
>+	stosl
>+	xchgl %eax,%ebx
>+	addl $0x1000,%eax
>+	loop 11b
>+
>+	/*
>+	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
>+	 */
>+	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>+	cmpl %ebp,%eax
>+	jb 10b
>+1:
>+	addl $__PAGE_OFFSET, %edi
>+	movl %edi, pa(_brk_end)
>+	shrl $12, %eax
>+	movl %eax, pa(max_pfn_mapped)
>+
>+	/* Do early initialization of the fixmap area */
>+	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>+	movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
>+#else	/* Not PAE */
>+
>+page_pde_offset = (__PAGE_OFFSET >> 20);
>+
>+	movl $pa(__brk_base), %edi
>+	movl $pa(initial_page_table), %edx
>+	movl $PTE_IDENT_ATTR, %eax
>+10:
>+	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
>+	movl %ecx,(%edx)			/* Store identity PDE entry */
>+	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
>+	addl $4,%edx
>+	movl $1024, %ecx
>+11:
>+	stosl
>+	addl $0x1000,%eax
>+	loop 11b
>+	/*
>+	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
>+	 */
>+	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>+	cmpl %ebp,%eax
>+	jb 10b
>+	addl $__PAGE_OFFSET, %edi
>+	movl %edi, pa(_brk_end)
>+	shrl $12, %eax
>+	movl %eax, pa(max_pfn_mapped)
>+
>+	/* Do early initialization of the fixmap area */
>+	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>+	movl %eax,pa(initial_page_table+0xffc)
>+#endif
>+	ret
>+ENDPROC(setup_pgtable_32)
>+
>+/*
>+ * BSS section
>+ */
>+__PAGE_ALIGNED_BSS
>+	.align PAGE_SIZE
>+#ifdef CONFIG_X86_PAE
>+initial_pg_pmd:
>+	.fill 1024*KPMDS,4,0
>+#else
>+ENTRY(initial_page_table)
>+	.fill 1024,4,0
>+#endif
>+initial_pg_fixmap:
>+	.fill 1024,4,0
>+
>+/*
>+ * This starts the data section.
>+ */
>+#ifdef CONFIG_X86_PAE
>+__PAGE_ALIGNED_DATA
>+	/* Page-aligned for the benefit of paravirt? */
>+	.align PAGE_SIZE
>+ENTRY(initial_page_table)
>+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
>+# if KPMDS == 3
>+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
>+# elif KPMDS == 2
>+	.long	0,0
>+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>+# elif KPMDS == 1
>+	.long	0,0
>+	.long	0,0
>+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>+# else
>+#  error "Kernel PMDs should be 1, 2 or 3"
>+# endif
>+	.align PAGE_SIZE		/* needs to be page-sized too */
>+#endif

And why does it need a separate entry point as opposed to the plain one?
Boris Ostrovsky Oct. 14, 2016, 6:44 p.m. UTC | #2
On 10/14/2016 02:31 PM, hpa@zytor.com wrote:
> On October 14, 2016 11:05:12 AM PDT, Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>> From: Matt Fleming <matt@codeblueprint.co.uk>
>>
>> The new Xen PVH entry point requires page tables to be setup by the
>> kernel since it is entered with paging disabled.
>>
>> Pull the common code out of head_32.S and into pgtable_32.S so that
>> setup_pgtable_32 can be invoked from both the new Xen entry point and
>> the existing startup_32 code.
>>
> And why does it need a separate entry point as opposed to the plain one?

One reason is that we need to prepare boot_params before jumping to
startup_{32|64}.

When the guest is loaded (always in 32-bit mode), the only thing we have
is a pointer to a Xen-specific data structure. The early PVH code will
prepare the zeropage based on that structure and then jump to the regular
startup_*() code.

-boris
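
For illustration only, the flow described above might look roughly like the
following in C: build the zeropage from the Xen-provided structure, then
branch into the normal startup path. Everything below (structure layouts,
field names, the 0xff loader type) is a hypothetical stand-in, not code from
this series or the Xen ABI.

```c
#include <string.h>

/* Hypothetical stand-in for the Xen-provided PVH start structure. */
struct pvh_start_info {
	unsigned int magic;
	unsigned int cmdline_paddr;	/* physical address of the command line */
	unsigned int memmap_entries;	/* number of memory map entries */
};

/* Stand-in for the kernel's boot_params ("zeropage"). */
struct boot_params_stub {
	unsigned int  cmd_line_ptr;
	unsigned char type_of_loader;
	/* ... e820 map, ramdisk address/size, etc. ... */
};

static struct boot_params_stub pvh_bootparams;

/* Build the zeropage from the Xen structure before jumping to startup_32. */
void pvh_prepare_boot_params(const struct pvh_start_info *si)
{
	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
	pvh_bootparams.cmd_line_ptr   = si->cmdline_paddr;
	pvh_bootparams.type_of_loader = 0xff;	/* "undefined" bootloader ID */
	/* ... translate the Xen memory map into an e820 map here ... */
	/* finally jump to startup_32 with %esi pointing at &pvh_bootparams */
}
```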
H. Peter Anvin Oct. 14, 2016, 7:04 p.m. UTC | #3
On October 14, 2016 11:44:18 AM PDT, Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>On 10/14/2016 02:31 PM, hpa@zytor.com wrote:
>> On October 14, 2016 11:05:12 AM PDT, Boris Ostrovsky
><boris.ostrovsky@oracle.com> wrote:
>>> From: Matt Fleming <matt@codeblueprint.co.uk>
>>>
>>> The new Xen PVH entry point requires page tables to be setup by the
>>> kernel since it is entered with paging disabled.
>>>
>>> Pull the common code out of head_32.S and into pgtable_32.S so that
>>> setup_pgtable_32 can be invoked from both the new Xen entry point
>and
>>> the existing startup_32 code.
>>>
>> And why does it need a separate entry point as opposed to the plain
>one?
>
>One reason is that we need to prepare boot_params before jumping to
>startup_{32|64}.
>
>When the guest is loaded (always in 32-bit mode) the only thing we have
>is a pointer to Xen-specific datastructure. The early PVH code will
>prepare zeropage based on that structure and then jump to regular
>startup_*() code.
>
>-boris

And why not just resume execution at start_32 then?
Boris Ostrovsky Oct. 14, 2016, 7:18 p.m. UTC | #4
On 10/14/2016 03:04 PM, hpa@zytor.com wrote:
> On October 14, 2016 11:44:18 AM PDT, Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>> On 10/14/2016 02:31 PM, hpa@zytor.com wrote:
>>> On October 14, 2016 11:05:12 AM PDT, Boris Ostrovsky
>> <boris.ostrovsky@oracle.com> wrote:
>>>> From: Matt Fleming <matt@codeblueprint.co.uk>
>>>>
>>>> The new Xen PVH entry point requires page tables to be setup by the
>>>> kernel since it is entered with paging disabled.
>>>>
>>>> Pull the common code out of head_32.S and into pgtable_32.S so that
>>>> setup_pgtable_32 can be invoked from both the new Xen entry point
>> and
>>>> the existing startup_32 code.
>>>>
>>> And why does it need a separate entry point as opposed to the plain
>> one?
>>
>> One reason is that we need to prepare boot_params before jumping to
>> startup_{32|64}.
>>
>> When the guest is loaded (always in 32-bit mode) the only thing we have
>> is a pointer to Xen-specific datastructure. The early PVH code will
>> prepare zeropage based on that structure and then jump to regular
>> startup_*() code.
>>
>> -boris
> And why not just resume execution at start_32 then?

I am not sure what start_32 is.

If you meant startup_32, then that's exactly what we do (for 32-bit
guests) once the zeropage is set up.

-boris
Boris Ostrovsky Oct. 31, 2016, 12:33 p.m. UTC | #5
On 10/14/2016 02:05 PM, Boris Ostrovsky wrote:
> From: Matt Fleming <matt@codeblueprint.co.uk>
>
> The new Xen PVH entry point requires page tables to be setup by the
> kernel since it is entered with paging disabled.
>
> Pull the common code out of head_32.S and into pgtable_32.S so that
> setup_pgtable_32 can be invoked from both the new Xen entry point and
> the existing startup_32 code.


Ping to x86 maintainers.

Peter, you had questions about this patch. Did I answer them?

-boris


>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: x86@kernel.org
> Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
> ---
>  arch/x86/Makefile            |   2 +
>  arch/x86/kernel/Makefile     |   2 +
>  arch/x86/kernel/head_32.S    | 168 +------------------------------------
>  arch/x86/kernel/pgtable_32.S | 196 +++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 201 insertions(+), 167 deletions(-)
>  create mode 100644 arch/x86/kernel/pgtable_32.S
>
> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> index 2d44933..67cc771 100644
> --- a/arch/x86/Makefile
> +++ b/arch/x86/Makefile
> @@ -204,6 +204,8 @@ head-y += arch/x86/kernel/head$(BITS).o
>  head-y += arch/x86/kernel/ebda.o
>  head-y += arch/x86/kernel/platform-quirks.o
>
> +head-$(CONFIG_X86_32) += arch/x86/kernel/pgtable_32.o
> +
>  libs-y  += arch/x86/lib/
>
>  # See arch/x86/Kbuild for content of core part of the kernel
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 4dd5d50..eae85a5 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -8,6 +8,8 @@ extra-y	+= ebda.o
>  extra-y	+= platform-quirks.o
>  extra-y	+= vmlinux.lds
>
> +extra-$(CONFIG_X86_32) += pgtable_32.o
> +
>  CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
>
>  ifdef CONFIG_FUNCTION_TRACER
> diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
> index 5f40126..0db066e 100644
> --- a/arch/x86/kernel/head_32.S
> +++ b/arch/x86/kernel/head_32.S
> @@ -41,51 +41,6 @@
>  #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
>
>  /*
> - * This is how much memory in addition to the memory covered up to
> - * and including _end we need mapped initially.
> - * We need:
> - *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
> - *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
> - *
> - * Modulo rounding, each megabyte assigned here requires a kilobyte of
> - * memory, which is currently unreclaimed.
> - *
> - * This should be a multiple of a page.
> - *
> - * KERNEL_IMAGE_SIZE should be greater than pa(_end)
> - * and small than max_low_pfn, otherwise will waste some page table entries
> - */
> -
> -#if PTRS_PER_PMD > 1
> -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
> -#else
> -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
> -#endif
> -
> -/*
> - * Number of possible pages in the lowmem region.
> - *
> - * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
> - * gas warning about overflowing shift count when gas has been compiled
> - * with only a host target support using a 32-bit type for internal
> - * representation.
> - */
> -LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
> -
> -/* Enough space to fit pagetables for the low memory linear map */
> -MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
> -
> -/*
> - * Worst-case size of the kernel mapping we need to make:
> - * a relocatable kernel can live anywhere in lowmem, so we need to be able
> - * to map all of lowmem.
> - */
> -KERNEL_PAGES = LOWMEM_PAGES
> -
> -INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
> -RESERVE_BRK(pagetables, INIT_MAP_SIZE)
> -
> -/*
>   * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
>   * %esi points to the real-mode code as a 32-bit pointer.
>   * CS and DS must be 4 GB flat segments, but we don't depend on
> @@ -157,92 +112,7 @@ ENTRY(startup_32)
>  	call load_ucode_bsp
>  #endif
>
> -/*
> - * Initialize page tables.  This creates a PDE and a set of page
> - * tables, which are located immediately beyond __brk_base.  The variable
> - * _brk_end is set up to point to the first "safe" location.
> - * Mappings are created both at virtual address 0 (identity mapping)
> - * and PAGE_OFFSET for up to _end.
> - */
> -#ifdef CONFIG_X86_PAE
> -
> -	/*
> -	 * In PAE mode initial_page_table is statically defined to contain
> -	 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
> -	 * entries). The identity mapping is handled by pointing two PGD entries
> -	 * to the first kernel PMD.
> -	 *
> -	 * Note the upper half of each PMD or PTE are always zero at this stage.
> -	 */
> -
> -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
> -
> -	xorl %ebx,%ebx				/* %ebx is kept at zero */
> -
> -	movl $pa(__brk_base), %edi
> -	movl $pa(initial_pg_pmd), %edx
> -	movl $PTE_IDENT_ATTR, %eax
> -10:
> -	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PMD entry */
> -	movl %ecx,(%edx)			/* Store PMD entry */
> -						/* Upper half already zero */
> -	addl $8,%edx
> -	movl $512,%ecx
> -11:
> -	stosl
> -	xchgl %eax,%ebx
> -	stosl
> -	xchgl %eax,%ebx
> -	addl $0x1000,%eax
> -	loop 11b
> -
> -	/*
> -	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
> -	 */
> -	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
> -	cmpl %ebp,%eax
> -	jb 10b
> -1:
> -	addl $__PAGE_OFFSET, %edi
> -	movl %edi, pa(_brk_end)
> -	shrl $12, %eax
> -	movl %eax, pa(max_pfn_mapped)
> -
> -	/* Do early initialization of the fixmap area */
> -	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
> -	movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
> -#else	/* Not PAE */
> -
> -page_pde_offset = (__PAGE_OFFSET >> 20);
> -
> -	movl $pa(__brk_base), %edi
> -	movl $pa(initial_page_table), %edx
> -	movl $PTE_IDENT_ATTR, %eax
> -10:
> -	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
> -	movl %ecx,(%edx)			/* Store identity PDE entry */
> -	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
> -	addl $4,%edx
> -	movl $1024, %ecx
> -11:
> -	stosl
> -	addl $0x1000,%eax
> -	loop 11b
> -	/*
> -	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
> -	 */
> -	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
> -	cmpl %ebp,%eax
> -	jb 10b
> -	addl $__PAGE_OFFSET, %edi
> -	movl %edi, pa(_brk_end)
> -	shrl $12, %eax
> -	movl %eax, pa(max_pfn_mapped)
> -
> -	/* Do early initialization of the fixmap area */
> -	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
> -	movl %eax,pa(initial_page_table+0xffc)
> -#endif
> +	call setup_pgtable_32
>
>  #ifdef CONFIG_PARAVIRT
>  	/* This is can only trip for a broken bootloader... */
> @@ -660,47 +530,11 @@ ENTRY(setup_once_ref)
>   */
>  __PAGE_ALIGNED_BSS
>  	.align PAGE_SIZE
> -#ifdef CONFIG_X86_PAE
> -initial_pg_pmd:
> -	.fill 1024*KPMDS,4,0
> -#else
> -ENTRY(initial_page_table)
> -	.fill 1024,4,0
> -#endif
> -initial_pg_fixmap:
> -	.fill 1024,4,0
>  ENTRY(empty_zero_page)
>  	.fill 4096,1,0
>  ENTRY(swapper_pg_dir)
>  	.fill 1024,4,0
>
> -/*
> - * This starts the data section.
> - */
> -#ifdef CONFIG_X86_PAE
> -__PAGE_ALIGNED_DATA
> -	/* Page-aligned for the benefit of paravirt? */
> -	.align PAGE_SIZE
> -ENTRY(initial_page_table)
> -	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
> -# if KPMDS == 3
> -	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
> -	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
> -	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
> -# elif KPMDS == 2
> -	.long	0,0
> -	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
> -	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
> -# elif KPMDS == 1
> -	.long	0,0
> -	.long	0,0
> -	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
> -# else
> -#  error "Kernel PMDs should be 1, 2 or 3"
> -# endif
> -	.align PAGE_SIZE		/* needs to be page-sized too */
> -#endif
> -
>  .data
>  .balign 4
>  ENTRY(initial_stack)
> diff --git a/arch/x86/kernel/pgtable_32.S b/arch/x86/kernel/pgtable_32.S
> new file mode 100644
> index 0000000..aded718
> --- /dev/null
> +++ b/arch/x86/kernel/pgtable_32.S
> @@ -0,0 +1,196 @@
> +#include <linux/threads.h>
> +#include <linux/init.h>
> +#include <linux/linkage.h>
> +#include <asm/segment.h>
> +#include <asm/page_types.h>
> +#include <asm/pgtable_types.h>
> +#include <asm/cache.h>
> +#include <asm/thread_info.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/setup.h>
> +#include <asm/processor-flags.h>
> +#include <asm/msr-index.h>
> +#include <asm/cpufeatures.h>
> +#include <asm/percpu.h>
> +#include <asm/nops.h>
> +#include <asm/bootparam.h>
> +
> +/* Physical address */
> +#define pa(X) ((X) - __PAGE_OFFSET)
> +
> +/*
> + * This is how much memory in addition to the memory covered up to
> + * and including _end we need mapped initially.
> + * We need:
> + *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
> + *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
> + *
> + * Modulo rounding, each megabyte assigned here requires a kilobyte of
> + * memory, which is currently unreclaimed.
> + *
> + * This should be a multiple of a page.
> + *
> + * KERNEL_IMAGE_SIZE should be greater than pa(_end)
> + * and small than max_low_pfn, otherwise will waste some page table entries
> + */
> +
> +#if PTRS_PER_PMD > 1
> +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
> +#else
> +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
> +#endif
> +
> +/*
> + * Number of possible pages in the lowmem region.
> + *
> + * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
> + * gas warning about overflowing shift count when gas has been compiled
> + * with only a host target support using a 32-bit type for internal
> + * representation.
> + */
> +LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
> +
> +/* Enough space to fit pagetables for the low memory linear map */
> +MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
> +
> +/*
> + * Worst-case size of the kernel mapping we need to make:
> + * a relocatable kernel can live anywhere in lowmem, so we need to be able
> + * to map all of lowmem.
> + */
> +KERNEL_PAGES = LOWMEM_PAGES
> +
> +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
> +RESERVE_BRK(pagetables, INIT_MAP_SIZE)
> +
> +/*
> + * Initialize page tables.  This creates a PDE and a set of page
> + * tables, which are located immediately beyond __brk_base.  The variable
> + * _brk_end is set up to point to the first "safe" location.
> + * Mappings are created both at virtual address 0 (identity mapping)
> + * and PAGE_OFFSET for up to _end.
> + */
> +	.text
> +ENTRY(setup_pgtable_32)
> +#ifdef CONFIG_X86_PAE
> +	/*
> +	 * In PAE mode initial_page_table is statically defined to contain
> +	 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
> +	 * entries). The identity mapping is handled by pointing two PGD entries
> +	 * to the first kernel PMD.
> +	 *
> +	 * Note the upper half of each PMD or PTE are always zero at this stage.
> +	 */
> +
> +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
> +
> +	xorl %ebx,%ebx				/* %ebx is kept at zero */
> +
> +	movl $pa(__brk_base), %edi
> +	movl $pa(initial_pg_pmd), %edx
> +	movl $PTE_IDENT_ATTR, %eax
> +10:
> +	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PMD entry */
> +	movl %ecx,(%edx)			/* Store PMD entry */
> +						/* Upper half already zero */
> +	addl $8,%edx
> +	movl $512,%ecx
> +11:
> +	stosl
> +	xchgl %eax,%ebx
> +	stosl
> +	xchgl %eax,%ebx
> +	addl $0x1000,%eax
> +	loop 11b
> +
> +	/*
> +	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
> +	 */
> +	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
> +	cmpl %ebp,%eax
> +	jb 10b
> +1:
> +	addl $__PAGE_OFFSET, %edi
> +	movl %edi, pa(_brk_end)
> +	shrl $12, %eax
> +	movl %eax, pa(max_pfn_mapped)
> +
> +	/* Do early initialization of the fixmap area */
> +	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
> +	movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
> +#else	/* Not PAE */
> +
> +page_pde_offset = (__PAGE_OFFSET >> 20);
> +
> +	movl $pa(__brk_base), %edi
> +	movl $pa(initial_page_table), %edx
> +	movl $PTE_IDENT_ATTR, %eax
> +10:
> +	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
> +	movl %ecx,(%edx)			/* Store identity PDE entry */
> +	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
> +	addl $4,%edx
> +	movl $1024, %ecx
> +11:
> +	stosl
> +	addl $0x1000,%eax
> +	loop 11b
> +	/*
> +	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
> +	 */
> +	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
> +	cmpl %ebp,%eax
> +	jb 10b
> +	addl $__PAGE_OFFSET, %edi
> +	movl %edi, pa(_brk_end)
> +	shrl $12, %eax
> +	movl %eax, pa(max_pfn_mapped)
> +
> +	/* Do early initialization of the fixmap area */
> +	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
> +	movl %eax,pa(initial_page_table+0xffc)
> +#endif
> +	ret
> +ENDPROC(setup_pgtable_32)
> +
> +/*
> + * BSS section
> + */
> +__PAGE_ALIGNED_BSS
> +	.align PAGE_SIZE
> +#ifdef CONFIG_X86_PAE
> +initial_pg_pmd:
> +	.fill 1024*KPMDS,4,0
> +#else
> +ENTRY(initial_page_table)
> +	.fill 1024,4,0
> +#endif
> +initial_pg_fixmap:
> +	.fill 1024,4,0
> +
> +/*
> + * This starts the data section.
> + */
> +#ifdef CONFIG_X86_PAE
> +__PAGE_ALIGNED_DATA
> +	/* Page-aligned for the benefit of paravirt? */
> +	.align PAGE_SIZE
> +ENTRY(initial_page_table)
> +	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
> +# if KPMDS == 3
> +	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
> +	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
> +	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
> +# elif KPMDS == 2
> +	.long	0,0
> +	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
> +	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
> +# elif KPMDS == 1
> +	.long	0,0
> +	.long	0,0
> +	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
> +# else
> +#  error "Kernel PMDs should be 1, 2 or 3"
> +# endif
> +	.align PAGE_SIZE		/* needs to be page-sized too */
> +#endif
>
Boris Ostrovsky Dec. 1, 2016, 3:33 p.m. UTC | #6
On 10/31/2016 08:33 AM, Boris Ostrovsky wrote:
>
>
> On 10/14/2016 02:05 PM, Boris Ostrovsky wrote:
>> From: Matt Fleming <matt@codeblueprint.co.uk>
>>
>> The new Xen PVH entry point requires page tables to be setup by the
>> kernel since it is entered with paging disabled.
>>
>> Pull the common code out of head_32.S and into pgtable_32.S so that
>> setup_pgtable_32 can be invoked from both the new Xen entry point and
>> the existing startup_32 code.
>
>
> Ping to x86 maintainers.

Pinging again.

I will be re-sending this series at some point (it has been delayed by
some hypervisor changes that are needed), but I'd like to hear from the
x86 maintainers whether this approach is acceptable before I post it again.

Thanks.
-boris

>
> Peter, you had questions about this patch. Did I answer them?
>
> -boris
>
>
>>
>> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: x86@kernel.org
>> Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
>> ---
>>  arch/x86/Makefile            |   2 +
>>  arch/x86/kernel/Makefile     |   2 +
>>  arch/x86/kernel/head_32.S    | 168
>> +------------------------------------
>>  arch/x86/kernel/pgtable_32.S | 196
>> +++++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 201 insertions(+), 167 deletions(-)
>>  create mode 100644 arch/x86/kernel/pgtable_32.S
>>
>> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
>> index 2d44933..67cc771 100644
>> --- a/arch/x86/Makefile
>> +++ b/arch/x86/Makefile
>> @@ -204,6 +204,8 @@ head-y += arch/x86/kernel/head$(BITS).o
>>  head-y += arch/x86/kernel/ebda.o
>>  head-y += arch/x86/kernel/platform-quirks.o
>>
>> +head-$(CONFIG_X86_32) += arch/x86/kernel/pgtable_32.o
>> +
>>  libs-y  += arch/x86/lib/
>>
>>  # See arch/x86/Kbuild for content of core part of the kernel
>> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
>> index 4dd5d50..eae85a5 100644
>> --- a/arch/x86/kernel/Makefile
>> +++ b/arch/x86/kernel/Makefile
>> @@ -8,6 +8,8 @@ extra-y    += ebda.o
>>  extra-y    += platform-quirks.o
>>  extra-y    += vmlinux.lds
>>
>> +extra-$(CONFIG_X86_32) += pgtable_32.o
>> +
>>  CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
>>
>>  ifdef CONFIG_FUNCTION_TRACER
>> diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
>> index 5f40126..0db066e 100644
>> --- a/arch/x86/kernel/head_32.S
>> +++ b/arch/x86/kernel/head_32.S
>> @@ -41,51 +41,6 @@
>>  #define X86_VENDOR_ID    new_cpu_data+CPUINFO_x86_vendor_id
>>
>>  /*
>> - * This is how much memory in addition to the memory covered up to
>> - * and including _end we need mapped initially.
>> - * We need:
>> - *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
>> - *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
>> - *
>> - * Modulo rounding, each megabyte assigned here requires a kilobyte of
>> - * memory, which is currently unreclaimed.
>> - *
>> - * This should be a multiple of a page.
>> - *
>> - * KERNEL_IMAGE_SIZE should be greater than pa(_end)
>> - * and small than max_low_pfn, otherwise will waste some page table
>> entries
>> - */
>> -
>> -#if PTRS_PER_PMD > 1
>> -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) +
>> PTRS_PER_PGD)
>> -#else
>> -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
>> -#endif
>> -
>> -/*
>> - * Number of possible pages in the lowmem region.
>> - *
>> - * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
>> - * gas warning about overflowing shift count when gas has been compiled
>> - * with only a host target support using a 32-bit type for internal
>> - * representation.
>> - */
>> -LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
>> -
>> -/* Enough space to fit pagetables for the low memory linear map */
>> -MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
>> -
>> -/*
>> - * Worst-case size of the kernel mapping we need to make:
>> - * a relocatable kernel can live anywhere in lowmem, so we need to
>> be able
>> - * to map all of lowmem.
>> - */
>> -KERNEL_PAGES = LOWMEM_PAGES
>> -
>> -INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
>> -RESERVE_BRK(pagetables, INIT_MAP_SIZE)
>> -
>> -/*
>>   * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
>>   * %esi points to the real-mode code as a 32-bit pointer.
>>   * CS and DS must be 4 GB flat segments, but we don't depend on
>> @@ -157,92 +112,7 @@ ENTRY(startup_32)
>>      call load_ucode_bsp
>>  #endif
>>
>> -/*
>> - * Initialize page tables.  This creates a PDE and a set of page
>> - * tables, which are located immediately beyond __brk_base.  The
>> variable
>> - * _brk_end is set up to point to the first "safe" location.
>> - * Mappings are created both at virtual address 0 (identity mapping)
>> - * and PAGE_OFFSET for up to _end.
>> - */
>> -#ifdef CONFIG_X86_PAE
>> -
>> -    /*
>> -     * In PAE mode initial_page_table is statically defined to contain
>> -     * enough entries to cover the VMSPLIT option (that is the top
>> 1, 2 or 3
>> -     * entries). The identity mapping is handled by pointing two PGD
>> entries
>> -     * to the first kernel PMD.
>> -     *
>> -     * Note the upper half of each PMD or PTE are always zero at
>> this stage.
>> -     */
>> -
>> -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel
>> PMDs */
>> -
>> -    xorl %ebx,%ebx                /* %ebx is kept at zero */
>> -
>> -    movl $pa(__brk_base), %edi
>> -    movl $pa(initial_pg_pmd), %edx
>> -    movl $PTE_IDENT_ATTR, %eax
>> -10:
>> -    leal PDE_IDENT_ATTR(%edi),%ecx        /* Create PMD entry */
>> -    movl %ecx,(%edx)            /* Store PMD entry */
>> -                        /* Upper half already zero */
>> -    addl $8,%edx
>> -    movl $512,%ecx
>> -11:
>> -    stosl
>> -    xchgl %eax,%ebx
>> -    stosl
>> -    xchgl %eax,%ebx
>> -    addl $0x1000,%eax
>> -    loop 11b
>> -
>> -    /*
>> -     * End condition: we must map up to the end + MAPPING_BEYOND_END.
>> -     */
>> -    movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>> -    cmpl %ebp,%eax
>> -    jb 10b
>> -1:
>> -    addl $__PAGE_OFFSET, %edi
>> -    movl %edi, pa(_brk_end)
>> -    shrl $12, %eax
>> -    movl %eax, pa(max_pfn_mapped)
>> -
>> -    /* Do early initialization of the fixmap area */
>> -    movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>> -    movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
>> -#else    /* Not PAE */
>> -
>> -page_pde_offset = (__PAGE_OFFSET >> 20);
>> -
>> -    movl $pa(__brk_base), %edi
>> -    movl $pa(initial_page_table), %edx
>> -    movl $PTE_IDENT_ATTR, %eax
>> -10:
>> -    leal PDE_IDENT_ATTR(%edi),%ecx        /* Create PDE entry */
>> -    movl %ecx,(%edx)            /* Store identity PDE entry */
>> -    movl %ecx,page_pde_offset(%edx)        /* Store kernel PDE entry */
>> -    addl $4,%edx
>> -    movl $1024, %ecx
>> -11:
>> -    stosl
>> -    addl $0x1000,%eax
>> -    loop 11b
>> -    /*
>> -     * End condition: we must map up to the end + MAPPING_BEYOND_END.
>> -     */
>> -    movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>> -    cmpl %ebp,%eax
>> -    jb 10b
>> -    addl $__PAGE_OFFSET, %edi
>> -    movl %edi, pa(_brk_end)
>> -    shrl $12, %eax
>> -    movl %eax, pa(max_pfn_mapped)
>> -
>> -    /* Do early initialization of the fixmap area */
>> -    movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>> -    movl %eax,pa(initial_page_table+0xffc)
>> -#endif
>> +    call setup_pgtable_32
>>
>>  #ifdef CONFIG_PARAVIRT
>>      /* This is can only trip for a broken bootloader... */
>> @@ -660,47 +530,11 @@ ENTRY(setup_once_ref)
>>   */
>>  __PAGE_ALIGNED_BSS
>>      .align PAGE_SIZE
>> -#ifdef CONFIG_X86_PAE
>> -initial_pg_pmd:
>> -    .fill 1024*KPMDS,4,0
>> -#else
>> -ENTRY(initial_page_table)
>> -    .fill 1024,4,0
>> -#endif
>> -initial_pg_fixmap:
>> -    .fill 1024,4,0
>>  ENTRY(empty_zero_page)
>>      .fill 4096,1,0
>>  ENTRY(swapper_pg_dir)
>>      .fill 1024,4,0
>>
>> -/*
>> - * This starts the data section.
>> - */
>> -#ifdef CONFIG_X86_PAE
>> -__PAGE_ALIGNED_DATA
>> -    /* Page-aligned for the benefit of paravirt? */
>> -    .align PAGE_SIZE
>> -ENTRY(initial_page_table)
>> -    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR),0    /* low identity
>> map */
>> -# if KPMDS == 3
>> -    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>> -    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>> -    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
>> -# elif KPMDS == 2
>> -    .long    0,0
>> -    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>> -    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>> -# elif KPMDS == 1
>> -    .long    0,0
>> -    .long    0,0
>> -    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>> -# else
>> -#  error "Kernel PMDs should be 1, 2 or 3"
>> -# endif
>> -    .align PAGE_SIZE        /* needs to be page-sized too */
>> -#endif
>> -
>>  .data
>>  .balign 4
>>  ENTRY(initial_stack)
>> diff --git a/arch/x86/kernel/pgtable_32.S b/arch/x86/kernel/pgtable_32.S
>> new file mode 100644
>> index 0000000..aded718
>> --- /dev/null
>> +++ b/arch/x86/kernel/pgtable_32.S
>> @@ -0,0 +1,196 @@
>> +#include <linux/threads.h>
>> +#include <linux/init.h>
>> +#include <linux/linkage.h>
>> +#include <asm/segment.h>
>> +#include <asm/page_types.h>
>> +#include <asm/pgtable_types.h>
>> +#include <asm/cache.h>
>> +#include <asm/thread_info.h>
>> +#include <asm/asm-offsets.h>
>> +#include <asm/setup.h>
>> +#include <asm/processor-flags.h>
>> +#include <asm/msr-index.h>
>> +#include <asm/cpufeatures.h>
>> +#include <asm/percpu.h>
>> +#include <asm/nops.h>
>> +#include <asm/bootparam.h>
>> +
>> +/* Physical address */
>> +#define pa(X) ((X) - __PAGE_OFFSET)
>> +
>> +/*
>> + * This is how much memory in addition to the memory covered up to
>> + * and including _end we need mapped initially.
>> + * We need:
>> + *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
>> + *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
>> + *
>> + * Modulo rounding, each megabyte assigned here requires a kilobyte of
>> + * memory, which is currently unreclaimed.
>> + *
>> + * This should be a multiple of a page.
>> + *
>> + * KERNEL_IMAGE_SIZE should be greater than pa(_end)
>> + * and small than max_low_pfn, otherwise will waste some page table
>> entries
>> + */
>> +
>> +#if PTRS_PER_PMD > 1
>> +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) +
>> PTRS_PER_PGD)
>> +#else
>> +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
>> +#endif
>> +
>> +/*
>> + * Number of possible pages in the lowmem region.
>> + *
>> + * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
>> + * gas warning about overflowing shift count when gas has been compiled
>> + * with only a host target support using a 32-bit type for internal
>> + * representation.
>> + */
>> +LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
>> +
>> +/* Enough space to fit pagetables for the low memory linear map */
>> +MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
>> +
>> +/*
>> + * Worst-case size of the kernel mapping we need to make:
>> + * a relocatable kernel can live anywhere in lowmem, so we need to
>> be able
>> + * to map all of lowmem.
>> + */
>> +KERNEL_PAGES = LOWMEM_PAGES
>> +
>> +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
>> +RESERVE_BRK(pagetables, INIT_MAP_SIZE)
>> +
>> +/*
>> + * Initialize page tables.  This creates a PDE and a set of page
>> + * tables, which are located immediately beyond __brk_base.  The
>> variable
>> + * _brk_end is set up to point to the first "safe" location.
>> + * Mappings are created both at virtual address 0 (identity mapping)
>> + * and PAGE_OFFSET for up to _end.
>> + */
>> +    .text
>> +ENTRY(setup_pgtable_32)
>> +#ifdef CONFIG_X86_PAE
>> +    /*
>> +     * In PAE mode initial_page_table is statically defined to contain
>> +     * enough entries to cover the VMSPLIT option (that is the top
>> 1, 2 or 3
>> +     * entries). The identity mapping is handled by pointing two PGD
>> entries
>> +     * to the first kernel PMD.
>> +     *
>> +     * Note the upper half of each PMD or PTE are always zero at
>> this stage.
>> +     */
>> +
>> +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel
>> PMDs */
>> +
>> +    xorl %ebx,%ebx                /* %ebx is kept at zero */
>> +
>> +    movl $pa(__brk_base), %edi
>> +    movl $pa(initial_pg_pmd), %edx
>> +    movl $PTE_IDENT_ATTR, %eax
>> +10:
>> +    leal PDE_IDENT_ATTR(%edi),%ecx        /* Create PMD entry */
>> +    movl %ecx,(%edx)            /* Store PMD entry */
>> +                        /* Upper half already zero */
>> +    addl $8,%edx
>> +    movl $512,%ecx
>> +11:
>> +    stosl
>> +    xchgl %eax,%ebx
>> +    stosl
>> +    xchgl %eax,%ebx
>> +    addl $0x1000,%eax
>> +    loop 11b
>> +
>> +    /*
>> +     * End condition: we must map up to the end + MAPPING_BEYOND_END.
>> +     */
>> +    movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>> +    cmpl %ebp,%eax
>> +    jb 10b
>> +1:
>> +    addl $__PAGE_OFFSET, %edi
>> +    movl %edi, pa(_brk_end)
>> +    shrl $12, %eax
>> +    movl %eax, pa(max_pfn_mapped)
>> +
>> +    /* Do early initialization of the fixmap area */
>> +    movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>> +    movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
>> +#else    /* Not PAE */
>> +
>> +page_pde_offset = (__PAGE_OFFSET >> 20);
>> +
>> +    movl $pa(__brk_base), %edi
>> +    movl $pa(initial_page_table), %edx
>> +    movl $PTE_IDENT_ATTR, %eax
>> +10:
>> +    leal PDE_IDENT_ATTR(%edi),%ecx        /* Create PDE entry */
>> +    movl %ecx,(%edx)            /* Store identity PDE entry */
>> +    movl %ecx,page_pde_offset(%edx)        /* Store kernel PDE entry */
>> +    addl $4,%edx
>> +    movl $1024, %ecx
>> +11:
>> +    stosl
>> +    addl $0x1000,%eax
>> +    loop 11b
>> +    /*
>> +     * End condition: we must map up to the end + MAPPING_BEYOND_END.
>> +     */
>> +    movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>> +    cmpl %ebp,%eax
>> +    jb 10b
>> +    addl $__PAGE_OFFSET, %edi
>> +    movl %edi, pa(_brk_end)
>> +    shrl $12, %eax
>> +    movl %eax, pa(max_pfn_mapped)
>> +
>> +    /* Do early initialization of the fixmap area */
>> +    movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>> +    movl %eax,pa(initial_page_table+0xffc)
>> +#endif
>> +    ret
>> +ENDPROC(setup_pgtable_32)
>> +
>> +/*
>> + * BSS section
>> + */
>> +__PAGE_ALIGNED_BSS
>> +    .align PAGE_SIZE
>> +#ifdef CONFIG_X86_PAE
>> +initial_pg_pmd:
>> +    .fill 1024*KPMDS,4,0
>> +#else
>> +ENTRY(initial_page_table)
>> +    .fill 1024,4,0
>> +#endif
>> +initial_pg_fixmap:
>> +    .fill 1024,4,0
>> +
>> +/*
>> + * This starts the data section.
>> + */
>> +#ifdef CONFIG_X86_PAE
>> +__PAGE_ALIGNED_DATA
>> +    /* Page-aligned for the benefit of paravirt? */
>> +    .align PAGE_SIZE
>> +ENTRY(initial_page_table)
>> +    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR),0    /* low identity
>> map */
>> +# if KPMDS == 3
>> +    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>> +    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>> +    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
>> +# elif KPMDS == 2
>> +    .long    0,0
>> +    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>> +    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>> +# elif KPMDS == 1
>> +    .long    0,0
>> +    .long    0,0
>> +    .long    pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>> +# else
>> +#  error "Kernel PMDs should be 1, 2 or 3"
>> +# endif
>> +    .align PAGE_SIZE        /* needs to be page-sized too */
>> +#endif
>>
Ingo Molnar Dec. 2, 2016, 9:45 a.m. UTC | #7
* Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:

> On 10/31/2016 08:33 AM, Boris Ostrovsky wrote:
> >
> >
> > On 10/14/2016 02:05 PM, Boris Ostrovsky wrote:
> >> From: Matt Fleming <matt@codeblueprint.co.uk>
> >>
> >> The new Xen PVH entry point requires page tables to be setup by the
> >> kernel since it is entered with paging disabled.
> >>
> >> Pull the common code out of head_32.S and into pgtable_32.S so that
> >> setup_pgtable_32 can be invoked from both the new Xen entry point and
> >> the existing startup_32 code.
> >
> >
> > Ping to x86 maintainers.
> 
> Pinging again.
> 
> I will be re-sending this series at some point (it has been delayed by
> some hypervisor changes that will be needed) but I'd like to hear from
> x86 maintainers whether this will be acceptable before I post this again.

Could this be done in C?

Thanks,

	Ingo
Boris Ostrovsky Dec. 2, 2016, 2:04 p.m. UTC | #8
On 12/02/2016 04:45 AM, Ingo Molnar wrote:
> * Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>
>> On 10/31/2016 08:33 AM, Boris Ostrovsky wrote:
>>>
>>> On 10/14/2016 02:05 PM, Boris Ostrovsky wrote:
>>>> From: Matt Fleming <matt@codeblueprint.co.uk>
>>>>
>>>> The new Xen PVH entry point requires page tables to be setup by the
>>>> kernel since it is entered with paging disabled.
>>>>
>>>> Pull the common code out of head_32.S and into pgtable_32.S so that
>>>> setup_pgtable_32 can be invoked from both the new Xen entry point and
>>>> the existing startup_32 code.
>>>
>>> Ping to x86 maintainers.
>> Pinging again.
>>
>> I will be re-sending this series at some point (it has been delayed by
>> some hypervisor changes that will be needed) but I'd like to hear from
>> x86 maintainers whether this will be acceptable before I post this again.
> Could this be done in C?

I suppose it could be; I haven't thought about it.

The goal here was simply to make the existing startup code available to
others (the Xen guest) without changes. Are you suggesting building page
tables in C for the Xen guest only, or making startup_32 call the new C
code as well?

-boris
Ingo Molnar Dec. 2, 2016, 4:08 p.m. UTC | #9
* Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:

> On 12/02/2016 04:45 AM, Ingo Molnar wrote:
> > * Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
> >
> >> On 10/31/2016 08:33 AM, Boris Ostrovsky wrote:
> >>>
> >>> On 10/14/2016 02:05 PM, Boris Ostrovsky wrote:
> >>>> From: Matt Fleming <matt@codeblueprint.co.uk>
> >>>>
> >>>> The new Xen PVH entry point requires page tables to be setup by the
> >>>> kernel since it is entered with paging disabled.
> >>>>
> >>>> Pull the common code out of head_32.S and into pgtable_32.S so that
> >>>> setup_pgtable_32 can be invoked from both the new Xen entry point and
> >>>> the existing startup_32 code.
> >>>
> >>> Ping to x86 maintainers.
> >> Pinging again.
> >>
> >> I will be re-sending this series at some point (it has been delayed by
> >> some hypervisor changes that will be needed) but I'd like to hear from
> >> x86 maintainers whether this will be acceptable before I post this again.
> > Could this be done in C?
> 
> I suppose it could be, I haven't thought about it.
> 
> The goal here was to simply make existing startup code available to others (Xen 
> guest) without changes. Are you suggesting to build page tables in C for the Xen 
> guest only or to make startup_32 call new C code as well?

My suggestion would be to transform the factored out assembly code to C.

Thanks,

	Ingo
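
To make the suggestion concrete, here is a minimal sketch (not part of this
series) of what the non-PAE half of setup_pgtable_32 could look like as early
C. It assumes paging is still off, so every address handled is physical; the
constants, the 3G/1G split and the helper names are simplified stand-ins
rather than the kernel's real definitions, and the bookkeeping (_brk_end,
max_pfn_mapped, fixmap) is only indicated in comments.

```c
typedef unsigned int u32;

#define C_PAGE_OFFSET     0xC0000000u	/* assumes the default 3G/1G split */
#define C_PTE_IDENT_ATTR  0x003u	/* present + writable */
#define C_PDE_IDENT_ATTR  0x063u	/* present + writable + accessed + dirty */
#define C_PTRS_PER_PTE    1024

/*
 * pgd     = physical address of initial_page_table
 * pt      = physical address of __brk_base (page tables grow from here)
 * map_end = pa(_end) + MAPPING_BEYOND_END
 */
static void setup_pgtable_32_c(u32 *pgd, u32 *pt, u32 map_end)
{
	u32 paddr = 0;
	u32 pde = 0;				/* identity PDE index */
	u32 kernel_pde = C_PAGE_OFFSET >> 22;	/* PDE index of PAGE_OFFSET */

	while (paddr < map_end) {
		/* Point one PDE at the page table we are about to fill ... */
		u32 entry = (u32)pt | C_PDE_IDENT_ATTR;
		pgd[pde] = entry;		/* identity mapping */
		pgd[kernel_pde + pde] = entry;	/* PAGE_OFFSET mapping */
		pde++;

		/* ... then fill its 1024 PTEs, covering 4 MB of memory. */
		for (int i = 0; i < C_PTRS_PER_PTE; i++) {
			*pt++ = paddr | C_PTE_IDENT_ATTR;
			paddr += 0x1000;
		}
	}

	/* The real code would also set _brk_end and max_pfn_mapped here,
	 * and install initial_pg_fixmap in the last PDE slot. */
}
```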
H. Peter Anvin Dec. 2, 2016, 5:52 p.m. UTC | #10
On December 2, 2016 8:08:55 AM PST, Ingo Molnar <mingo@kernel.org> wrote:
>
>* Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>
>> On 12/02/2016 04:45 AM, Ingo Molnar wrote:
>> > * Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>> >
>> >> On 10/31/2016 08:33 AM, Boris Ostrovsky wrote:
>> >>>
>> >>> On 10/14/2016 02:05 PM, Boris Ostrovsky wrote:
>> >>>> From: Matt Fleming <matt@codeblueprint.co.uk>
>> >>>>
>> >>>> The new Xen PVH entry point requires page tables to be setup by
>the
>> >>>> kernel since it is entered with paging disabled.
>> >>>>
>> >>>> Pull the common code out of head_32.S and into pgtable_32.S so
>that
>> >>>> setup_pgtable_32 can be invoked from both the new Xen entry
>point and
>> >>>> the existing startup_32 code.
>> >>>
>> >>> Ping to x86 maintainers.
>> >> Pinging again.
>> >>
>> >> I will be re-sending this series at some point (it has been
>delayed by
>> >> some hypervisor changes that will be needed) but I'd like to hear
>from
>> >> x86 maintainers whether this will be acceptable before I post this
>again.
>> > Could this be done in C?
>> 
>> I suppose it could be, I haven't thought about it.
>> 
>> The goal here was to simply make existing startup code available to
>others (Xen 
>> guest) without changes. Are you suggesting to build page tables in C
>for the Xen 
>> guest only or to make startup_32 call new C code as well?
>
>My suggestion would be to transform the factored out assembly code to
>C.
>
>Thanks,
>
>	Ingo

It is tricky to do so safely, because at this stage almost nothing of the C execution environment has been set up.
Boris Ostrovsky Dec. 2, 2016, 7:49 p.m. UTC | #11
On 12/02/2016 12:52 PM, hpa@zytor.com wrote:
> On December 2, 2016 8:08:55 AM PST, Ingo Molnar <mingo@kernel.org> wrote:
>> * Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>>
>>> On 12/02/2016 04:45 AM, Ingo Molnar wrote:
>>>> * Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>>>>
>>>>> On 10/31/2016 08:33 AM, Boris Ostrovsky wrote:
>>>>>> On 10/14/2016 02:05 PM, Boris Ostrovsky wrote:
>>>>>>> From: Matt Fleming <matt@codeblueprint.co.uk>
>>>>>>>
>>>>>>> The new Xen PVH entry point requires page tables to be setup by
>> the
>>>>>>> kernel since it is entered with paging disabled.
>>>>>>>
>>>>>>> Pull the common code out of head_32.S and into pgtable_32.S so
>> that
>>>>>>> setup_pgtable_32 can be invoked from both the new Xen entry
>> point and
>>>>>>> the existing startup_32 code.
>>>>>> Ping to x86 maintainers.
>>>>> Pinging again.
>>>>>
>>>>> I will be re-sending this series at some point (it has been
>> delayed by
>>>>> some hypervisor changes that will be needed) but I'd like to hear
>> from
>>>>> x86 maintainers whether this will be acceptable before I post this
>> again.
>>>> Could this be done in C?
>>> I suppose it could be, I haven't thought about it.
>>>
>>> The goal here was to simply make existing startup code available to
>> others (Xen 
>>> guest) without changes. Are you suggesting to build page tables in C
>> for the Xen 
>>> guest only or to make startup_32 call new C code as well?
>> My suggestion would be to transform the factored out assembly code to
>> C.
>>
>> Thanks,
>>
>> 	Ingo
> It is tricky to do so safely, because at this stage almost nothing of the C execution environment has been set up.


I can still give it a try, but I'd rather not tie it to this (Xen PVH)
patch series. That would leave me with two options: either keep what
this patch does, leaving it as assembly (which requires your ack), or
have the Xen code build the page tables on its own.

-boris
Ingo Molnar Dec. 3, 2016, 5:49 a.m. UTC | #12
* Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:

> > It is tricky to do so safely, because at this stage almost nothing of the C 
> > execution environment has been set up.

Yeah - but we do have a fair amount of early C code.

> I can still give it a try but I'd rather not tie it to this (Xen PVH) patch 
> series. Which would leave me with two options: either keep what this patch does, 
> leaving it as assembly (requires your ack), or have Xen code build the pages on 
> its own.

If you give it a try in a subsequent patch (please Cc: me) then it's OK to me:

  Acked-by: Ingo Molnar <mingo@kernel.org>

Feel free to carry it in the Xen tree.

Thanks,

	Ingo
H. Peter Anvin Dec. 3, 2016, 6:36 a.m. UTC | #13
On December 2, 2016 9:49:50 PM PST, Ingo Molnar <mingo@kernel.org> wrote:
>
>* Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote:
>
>> > It is tricky to do so safely, because at this stage almost nothing
>of the C 
>> > execution environment has been set up.
>
>Yeah - but we do have a fair amount of early C code though.
>
>> I can still give it a try but I'd rather not tie it to this (Xen PVH)
>patch 
>> series. Which would leave me with two options: either keep what this
>patch does, 
>> leaving it as assembly (requires your ack), or have Xen code build
>the pages on 
>> its own.
>
>If you give it a try in a subsequent patch (please Cc: me) then it's OK
>to me:
>
>  Acked-by: Ingo Molnar <mingo@kernel.org>
>
>Feel free to carry it in the Xen tree.
>
>Thanks,
>
>	Ingo

It's true that it is now possible to run pre-paging C code.  It would be so much better if Xen could simply go through the normal code path like any civilized machine.
diff mbox

Patch

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d44933..67cc771 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -204,6 +204,8 @@  head-y += arch/x86/kernel/head$(BITS).o
 head-y += arch/x86/kernel/ebda.o
 head-y += arch/x86/kernel/platform-quirks.o
 
+head-$(CONFIG_X86_32) += arch/x86/kernel/pgtable_32.o
+
 libs-y  += arch/x86/lib/
 
 # See arch/x86/Kbuild for content of core part of the kernel
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4dd5d50..eae85a5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -8,6 +8,8 @@  extra-y	+= ebda.o
 extra-y	+= platform-quirks.o
 extra-y	+= vmlinux.lds
 
+extra-$(CONFIG_X86_32) += pgtable_32.o
+
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
 ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 5f40126..0db066e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -41,51 +41,6 @@ 
 #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
 
 /*
- * This is how much memory in addition to the memory covered up to
- * and including _end we need mapped initially.
- * We need:
- *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
- *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
- *
- * Modulo rounding, each megabyte assigned here requires a kilobyte of
- * memory, which is currently unreclaimed.
- *
- * This should be a multiple of a page.
- *
- * KERNEL_IMAGE_SIZE should be greater than pa(_end)
- * and small than max_low_pfn, otherwise will waste some page table entries
- */
-
-#if PTRS_PER_PMD > 1
-#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
-#else
-#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
-#endif
-
-/*
- * Number of possible pages in the lowmem region.
- *
- * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
- * gas warning about overflowing shift count when gas has been compiled
- * with only a host target support using a 32-bit type for internal
- * representation.
- */
-LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
-
-/* Enough space to fit pagetables for the low memory linear map */
-MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
-
-/*
- * Worst-case size of the kernel mapping we need to make:
- * a relocatable kernel can live anywhere in lowmem, so we need to be able
- * to map all of lowmem.
- */
-KERNEL_PAGES = LOWMEM_PAGES
-
-INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
-RESERVE_BRK(pagetables, INIT_MAP_SIZE)
-
-/*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
  * %esi points to the real-mode code as a 32-bit pointer.
  * CS and DS must be 4 GB flat segments, but we don't depend on
@@ -157,92 +112,7 @@  ENTRY(startup_32)
 	call load_ucode_bsp
 #endif
 
-/*
- * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond __brk_base.  The variable
- * _brk_end is set up to point to the first "safe" location.
- * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end.
- */
-#ifdef CONFIG_X86_PAE
-
-	/*
-	 * In PAE mode initial_page_table is statically defined to contain
-	 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
-	 * entries). The identity mapping is handled by pointing two PGD entries
-	 * to the first kernel PMD.
-	 *
-	 * Note the upper half of each PMD or PTE are always zero at this stage.
-	 */
-
-#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
-
-	xorl %ebx,%ebx				/* %ebx is kept at zero */
-
-	movl $pa(__brk_base), %edi
-	movl $pa(initial_pg_pmd), %edx
-	movl $PTE_IDENT_ATTR, %eax
-10:
-	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PMD entry */
-	movl %ecx,(%edx)			/* Store PMD entry */
-						/* Upper half already zero */
-	addl $8,%edx
-	movl $512,%ecx
-11:
-	stosl
-	xchgl %eax,%ebx
-	stosl
-	xchgl %eax,%ebx
-	addl $0x1000,%eax
-	loop 11b
-
-	/*
-	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
-	 */
-	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
-	cmpl %ebp,%eax
-	jb 10b
-1:
-	addl $__PAGE_OFFSET, %edi
-	movl %edi, pa(_brk_end)
-	shrl $12, %eax
-	movl %eax, pa(max_pfn_mapped)
-
-	/* Do early initialization of the fixmap area */
-	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
-	movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
-#else	/* Not PAE */
-
-page_pde_offset = (__PAGE_OFFSET >> 20);
-
-	movl $pa(__brk_base), %edi
-	movl $pa(initial_page_table), %edx
-	movl $PTE_IDENT_ATTR, %eax
-10:
-	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
-	movl %ecx,(%edx)			/* Store identity PDE entry */
-	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
-	addl $4,%edx
-	movl $1024, %ecx
-11:
-	stosl
-	addl $0x1000,%eax
-	loop 11b
-	/*
-	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
-	 */
-	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
-	cmpl %ebp,%eax
-	jb 10b
-	addl $__PAGE_OFFSET, %edi
-	movl %edi, pa(_brk_end)
-	shrl $12, %eax
-	movl %eax, pa(max_pfn_mapped)
-
-	/* Do early initialization of the fixmap area */
-	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
-	movl %eax,pa(initial_page_table+0xffc)
-#endif
+	call setup_pgtable_32
 
 #ifdef CONFIG_PARAVIRT
 	/* This is can only trip for a broken bootloader... */
@@ -660,47 +530,11 @@  ENTRY(setup_once_ref)
  */
 __PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
-#ifdef CONFIG_X86_PAE
-initial_pg_pmd:
-	.fill 1024*KPMDS,4,0
-#else
-ENTRY(initial_page_table)
-	.fill 1024,4,0
-#endif
-initial_pg_fixmap:
-	.fill 1024,4,0
 ENTRY(empty_zero_page)
 	.fill 4096,1,0
 ENTRY(swapper_pg_dir)
 	.fill 1024,4,0
 
-/*
- * This starts the data section.
- */
-#ifdef CONFIG_X86_PAE
-__PAGE_ALIGNED_DATA
-	/* Page-aligned for the benefit of paravirt? */
-	.align PAGE_SIZE
-ENTRY(initial_page_table)
-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
-# if KPMDS == 3
-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
-# elif KPMDS == 2
-	.long	0,0
-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
-# elif KPMDS == 1
-	.long	0,0
-	.long	0,0
-	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
-# else
-#  error "Kernel PMDs should be 1, 2 or 3"
-# endif
-	.align PAGE_SIZE		/* needs to be page-sized too */
-#endif
-
 .data
 .balign 4
 ENTRY(initial_stack)
diff --git a/arch/x86/kernel/pgtable_32.S b/arch/x86/kernel/pgtable_32.S
new file mode 100644
index 0000000..aded718
--- /dev/null
+++ b/arch/x86/kernel/pgtable_32.S
@@ -0,0 +1,196 @@ 
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+#include <asm/setup.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeatures.h>
+#include <asm/percpu.h>
+#include <asm/nops.h>
+#include <asm/bootparam.h>
+
+/* Physical address */
+#define pa(X) ((X) - __PAGE_OFFSET)
+
+/*
+ * This is how much memory in addition to the memory covered up to
+ * and including _end we need mapped initially.
+ * We need:
+ *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
+ *
+ * Modulo rounding, each megabyte assigned here requires a kilobyte of
+ * memory, which is currently unreclaimed.
+ *
+ * This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
+ */
+
+#if PTRS_PER_PMD > 1
+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
+#else
+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
+#endif
+
+/*
+ * Number of possible pages in the lowmem region.
+ *
+ * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
+ * gas warning about overflowing shift count when gas has been compiled
+ * with only a host target support using a 32-bit type for internal
+ * representation.
+ */
+LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
+
+/* Enough space to fit pagetables for the low memory linear map */
+MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
+
+/*
+ * Worst-case size of the kernel mapping we need to make:
+ * a relocatable kernel can live anywhere in lowmem, so we need to be able
+ * to map all of lowmem.
+ */
+KERNEL_PAGES = LOWMEM_PAGES
+
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
+RESERVE_BRK(pagetables, INIT_MAP_SIZE)
+
+/*
+ * Initialize page tables.  This creates a PDE and a set of page
+ * tables, which are located immediately beyond __brk_base.  The variable
+ * _brk_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end.
+ */
+	.text
+ENTRY(setup_pgtable_32)
+#ifdef CONFIG_X86_PAE
+	/*
+	 * In PAE mode initial_page_table is statically defined to contain
+	 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+	 * entries). The identity mapping is handled by pointing two PGD entries
+	 * to the first kernel PMD.
+	 *
+	 * Note the upper half of each PMD or PTE are always zero at this stage.
+	 */
+
+#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
+
+	xorl %ebx,%ebx				/* %ebx is kept at zero */
+
+	movl $pa(__brk_base), %edi
+	movl $pa(initial_pg_pmd), %edx
+	movl $PTE_IDENT_ATTR, %eax
+10:
+	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PMD entry */
+	movl %ecx,(%edx)			/* Store PMD entry */
+						/* Upper half already zero */
+	addl $8,%edx
+	movl $512,%ecx
+11:
+	stosl
+	xchgl %eax,%ebx
+	stosl
+	xchgl %eax,%ebx
+	addl $0x1000,%eax
+	loop 11b
+
+	/*
+	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
+	 */
+	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
+	cmpl %ebp,%eax
+	jb 10b
+1:
+	addl $__PAGE_OFFSET, %edi
+	movl %edi, pa(_brk_end)
+	shrl $12, %eax
+	movl %eax, pa(max_pfn_mapped)
+
+	/* Do early initialization of the fixmap area */
+	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+	movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
+#else	/* Not PAE */
+
+page_pde_offset = (__PAGE_OFFSET >> 20);
+
+	movl $pa(__brk_base), %edi
+	movl $pa(initial_page_table), %edx
+	movl $PTE_IDENT_ATTR, %eax
+10:
+	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
+	movl %ecx,(%edx)			/* Store identity PDE entry */
+	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
+	addl $4,%edx
+	movl $1024, %ecx
+11:
+	stosl
+	addl $0x1000,%eax
+	loop 11b
+	/*
+	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
+	 */
+	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
+	cmpl %ebp,%eax
+	jb 10b
+	addl $__PAGE_OFFSET, %edi
+	movl %edi, pa(_brk_end)
+	shrl $12, %eax
+	movl %eax, pa(max_pfn_mapped)
+
+	/* Do early initialization of the fixmap area */
+	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+	movl %eax,pa(initial_page_table+0xffc)
+#endif
+	ret
+ENDPROC(setup_pgtable_32)
+
+/*
+ * BSS section
+ */
+__PAGE_ALIGNED_BSS
+	.align PAGE_SIZE
+#ifdef CONFIG_X86_PAE
+initial_pg_pmd:
+	.fill 1024*KPMDS,4,0
+#else
+ENTRY(initial_page_table)
+	.fill 1024,4,0
+#endif
+initial_pg_fixmap:
+	.fill 1024,4,0
+
+/*
+ * This starts the data section.
+ */
+#ifdef CONFIG_X86_PAE
+__PAGE_ALIGNED_DATA
+	/* Page-aligned for the benefit of paravirt? */
+	.align PAGE_SIZE
+ENTRY(initial_page_table)
+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
+# if KPMDS == 3
+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
+# elif KPMDS == 2
+	.long	0,0
+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+# elif KPMDS == 1
+	.long	0,0
+	.long	0,0
+	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+# else
+#  error "Kernel PMDs should be 1, 2 or 3"
+# endif
+	.align PAGE_SIZE		/* needs to be page-sized too */
+#endif
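
[Editor's note: the sizing comments in the new pgtable_32.S above (LOWMEM_PAGES, MAPPING_BEYOND_END, INIT_MAP_SIZE) can be sanity-checked with a small stand-alone program. This is not kernel code; it simply redoes the arithmetic under the assumption of the default 3G/1G split (__PAGE_OFFSET = 0xC0000000) and 4 KiB pages.]

/* Stand-alone check of the brk-reservation arithmetic; assumes the
 * default VMSPLIT_3G layout.  Build with "cc -o initmap initmap.c". */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_OFFSET	0xC0000000UL

/* Mirrors PAGE_TABLE_SIZE(): PTRS_PER_PMD=512 and PTRS_PER_PGD=4 with PAE;
 * PTRS_PER_PGD=1024 without. */
static unsigned long page_table_size(unsigned long pages, int pae)
{
	return pae ? pages / 512 + 4 : pages / 1024;
}

int main(void)
{
	/* LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT) */
	unsigned long lowmem_pages = ((2ULL << 31) - PAGE_OFFSET) >> PAGE_SHIFT;
	int pae;

	printf("LOWMEM_PAGES = %lu pages (%lu MiB of lowmem)\n",
	       lowmem_pages, (lowmem_pages * PAGE_SIZE) >> 20);

	for (pae = 0; pae <= 1; pae++) {
		unsigned long pt_pages = page_table_size(lowmem_pages, pae);

		/* KERNEL_PAGES = LOWMEM_PAGES, so both sizes coincide here. */
		printf("%s: MAPPING_BEYOND_END = INIT_MAP_SIZE = %lu KiB (%lu pages)\n",
		       pae ? "PAE" : "non-PAE",
		       (pt_pages * PAGE_SIZE) >> 10, pt_pages);
	}
	return 0;
}

This prints 262144 lowmem pages (1 GiB), 1024 KiB / 256 pages of reservation without PAE and 2064 KiB / 516 pages with PAE, which matches the "each megabyte assigned here requires a kilobyte of memory" note in the comment: 1 GiB of lowmem costs about 1 MiB of page tables without PAE and roughly twice that with PAE, plus the four PMD pages.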