diff mbox

[v1,04/12] xen/hvmlite: Bootstrap HVMlite guest

Message ID 1453498558-6028-5-git-send-email-boris.ostrovsky@oracle.com
State New, archived
Headers show

Commit Message

Boris Ostrovsky Jan. 22, 2016, 9:35 p.m. UTC
Start HVMlite guest XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall
page, initialize boot_params, enable early page tables.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/Makefile      |    1 +
 arch/x86/xen/enlighten.c   |   91 +++++++++++++++++++++++++-
 arch/x86/xen/xen-hvmlite.S |  158 ++++++++++++++++++++++++++++++++++++++++++++
 include/xen/xen.h          |    6 ++
 4 files changed, 255 insertions(+), 1 deletions(-)
 create mode 100644 arch/x86/xen/xen-hvmlite.S

Comments

Luis Chamberlain Jan. 22, 2016, 11:32 p.m. UTC | #1
On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote:
> Start HVMlite guest XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall
> page, initialize boot_params, enable early page tables.
> 
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> ---
>  arch/x86/xen/Makefile      |    1 +
>  arch/x86/xen/enlighten.c   |   91 +++++++++++++++++++++++++-
>  arch/x86/xen/xen-hvmlite.S |  158 ++++++++++++++++++++++++++++++++++++++++++++
>  include/xen/xen.h          |    6 ++
>  4 files changed, 255 insertions(+), 1 deletions(-)
>  create mode 100644 arch/x86/xen/xen-hvmlite.S
> 
> diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
> index e47e527..1d913d7 100644
> --- a/arch/x86/xen/Makefile
> +++ b/arch/x86/xen/Makefile
> @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
>  obj-$(CONFIG_XEN_DOM0)		+= vga.o
>  obj-$(CONFIG_SWIOTLB_XEN)	+= pci-swiotlb-xen.o
>  obj-$(CONFIG_XEN_EFI)		+= efi.o
> +obj-$(CONFIG_XEN_PVHVM) 	+= xen-hvmlite.o
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index 2cf446a..2ed8b2b 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -118,7 +118,8 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
>   */
>  DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
>  
> -enum xen_domain_type xen_domain_type = XEN_NATIVE;
> +enum xen_domain_type xen_domain_type
> +	__attribute__((section(".data"))) = XEN_NATIVE;
>  EXPORT_SYMBOL_GPL(xen_domain_type);

But why? This is not explained.

>  
>  unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
> @@ -171,6 +172,17 @@ struct tls_descs {
>   */
>  static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
>  
> +#ifdef CONFIG_XEN_PVHVM
> +/*
> + * HVMlite variables. These need to live in data segment since they are
> + * initialized before startup_{32|64}, which clear .bss, are invoked.
> + */

I guess for this reason.. Perhaps a bit more clear:

/*
 * HVMlite variables. These need to live in data segment since they are 
 * before before startup_{32|64} is invoked, otherwise they would be cleared.
 */

> +int xen_hvmlite __attribute__((section(".data"))) = 0;
> +struct hvm_start_info hvmlite_start_info __attribute__((section(".data")));
> +uint hvmlite_start_info_sz = sizeof(hvmlite_start_info);
> +struct boot_params xen_hvmlite_boot_params __attribute__((section(".data")));
> +#endif
> +
>  static void clamp_max_cpus(void)
>  {
>  #ifdef CONFIG_SMP
> @@ -1736,6 +1748,83 @@ asmlinkage __visible void __init xen_start_kernel(void)
>  #endif
>  }
>  
> +#ifdef CONFIG_XEN_PVHVM
> +static void __init hvmlite_bootparams(void)
> +{

Hrm.

> +	struct xen_memory_map memmap;
> +	int i;
> +
> +	memset(&xen_hvmlite_boot_params, 0, sizeof(xen_hvmlite_boot_params));
> +
> +	memmap.nr_entries = ARRAY_SIZE(xen_hvmlite_boot_params.e820_map);
> +	set_xen_guest_handle(memmap.buffer, xen_hvmlite_boot_params.e820_map);
> +	if (HYPERVISOR_memory_op(XENMEM_memory_map, &memmap)) {
> +		xen_raw_console_write("XENMEM_memory_map failed\n");
> +		BUG();
> +	}
> +
> +	xen_hvmlite_boot_params.e820_map[memmap.nr_entries].addr =
> +		ISA_START_ADDRESS;
> +	xen_hvmlite_boot_params.e820_map[memmap.nr_entries].size =
> +		ISA_END_ADDRESS - ISA_START_ADDRESS;
> +	xen_hvmlite_boot_params.e820_map[memmap.nr_entries++].type =
> +		E820_RESERVED;
> +
> +	sanitize_e820_map(xen_hvmlite_boot_params.e820_map,
> +			  ARRAY_SIZE(xen_hvmlite_boot_params.e820_map),
> +			  &memmap.nr_entries);
> +
> +	xen_hvmlite_boot_params.e820_entries = memmap.nr_entries;
> +	for (i = 0; i < xen_hvmlite_boot_params.e820_entries; i++)
> +		e820_add_region(xen_hvmlite_boot_params.e820_map[i].addr,
> +				xen_hvmlite_boot_params.e820_map[i].size,
> +				xen_hvmlite_boot_params.e820_map[i].type);
> +
> +	xen_hvmlite_boot_params.hdr.cmd_line_ptr =
> +		hvmlite_start_info.cmdline_paddr;
> +
> +	/* The first module is always ramdisk */
> +	if (hvmlite_start_info.nr_modules) {
> +		struct hvm_modlist_entry *modaddr =
> +			__va(hvmlite_start_info.modlist_paddr);
> +		xen_hvmlite_boot_params.hdr.ramdisk_image = modaddr->paddr;
> +		xen_hvmlite_boot_params.hdr.ramdisk_size = modaddr->size;
> +	}
> +
> +	/*
> +	 * See Documentation/x86/boot.txt.
> +	 *
> +	 * Version 2.12 supports Xen entry point but we will use default x86/PC
> +	 * environment (i.e. hardware_subarch 0).
> +	 */
> +	xen_hvmlite_boot_params.hdr.version = 0x212;
> +	xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */
> +}

I realize PV got away with setting up boot_params on C code but best
ask now that this new code is being introduced: why can't we just have
the Xen hypervisor fill this in? It'd save us all this code.

On this page I show the difference, as an example of what this would look
like in contrast to how lguest set things up as an example in a very
clean way:

http://www.do-not-panic.com/2015/12/xen-and-x86-linux-zero-page.html

> +/*
> + * This routine (and those that it might call) should not use
> + * anything that lives in .bss since that segment will be cleared later
> + */
> +void __init xen_prepare_hvmlite(void)
> +{
> +	u32 eax, ecx, edx, msr;
> +	u64 pfn;
> +
> +	cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx);
> +	pfn = __pa(hypercall_page);
> +	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +
> +	pv_info.name = "Xen HVMlite";
> +	xen_domain_type = XEN_HVM_DOMAIN;
> +	xen_hvmlite = 1;
> +
> +	x86_init.oem.arch_setup = xen_init_kernel;
> +	x86_init.oem.banner = xen_banner;
> +
> +	hvmlite_bootparams();
> +}
> +#endif

If the boot_params.hdr.hardware_subarch_data pointed to a custom
struct then the first C entry point for Xen could shuffle this and
set this too, by still using less asm entry helpers. We'd still
need this run but with the linker table I think we could have
a stub small stub for hvm run, it would not be a call from asm.

> +
>  void __ref xen_hvm_init_shared_info(void)
>  {
>  	int cpu;
> diff --git a/arch/x86/xen/xen-hvmlite.S b/arch/x86/xen/xen-hvmlite.S
> new file mode 100644
> index 0000000..90f03d0
> --- /dev/null
> +++ b/arch/x86/xen/xen-hvmlite.S
> @@ -0,0 +1,158 @@
> +/*
> + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +	.code32
> +	.text
> +#define _pa(x)          ((x) - __START_KERNEL_map)
> +
> +#include <linux/elfnote.h>
> +#include <linux/init.h>
> +#include <linux/linkage.h>
> +#include <asm/segment.h>
> +#include <asm/asm.h>
> +#include <asm/boot.h>
> +#include <asm/processor-flags.h>
> +#include <asm/msr.h>
> +#include <xen/interface/elfnote.h>
> +
> +	__HEAD
> +	.code32
> +
> +/* Entry point for HVMlite guests */
> +ENTRY(hvmlite_start_xen)

Yet another asm entry point for Linux, could it perhaps someway be possible to
just share some of this asm entry code in a clean way ?

> +	cli
> +	cld
> +
> +	mov $_pa(gdt), %eax
> +	lgdt (%eax)
> +
> +	movl $(__BOOT_DS),%eax
> +	movl %eax,%ds
> +	movl %eax,%es
> +	movl %eax,%ss
> +
> +	/* Stash hvm_start_info */
> +	mov $_pa(hvmlite_start_info), %edi
> +	mov %ebx, %esi
> +	mov $_pa(hvmlite_start_info_sz), %ecx
> +	mov (%ecx), %ecx
> +	rep
> +	movsb
> +
> +	movl $_pa(early_stack_end), %eax
> +	movl %eax, %esp
> +
> +	/* Enable PAE mode */
> +	movl %cr4, %eax
> +	orl $X86_CR4_PAE, %eax
> +	movl %eax, %cr4
> +
> +#ifdef CONFIG_X86_64
> +	/* Enable Long mode */
> +	movl $MSR_EFER, %ecx
> +	rdmsr
> +	btsl $_EFER_LME, %eax
> +	wrmsr
> +
> +	/* Enable pre-constructed page tables */
> +	mov $_pa(init_level4_pgt), %eax
> +	movl %eax, %cr3
> +	movl $(X86_CR0_PG | X86_CR0_PE), %eax
> +	movl %eax, %cr0
> +
> +	/* Jump to 64-bit mode. */
> +	pushl $__KERNEL_CS
> +	leal _pa(1f), %eax
> +	pushl %eax
> +	lret
> +
> +	/* 64-bit entry point */
> +	.code64
> +1:
> +	call xen_prepare_hvmlite
> +
> +	/* startup_64 expects boot_params in %rsi */
> +	mov $_pa(xen_hvmlite_boot_params), %rsi
> +	movq $_pa(startup_64), %rax

Nice! But again why can't the Xen hypervisor just set the boot_params?
All other Linux loaders do it. Why is Xen special?

  Luis

> +	jmp *%rax
> +
> +#else /* CONFIG_X86_64 */
> +
> +	/* Use initial_page table and set level 2 to map 2M pages */
> +	movl $_pa(initial_pg_pmd), %edi
> +	movl $(_PAGE_PSE | _PAGE_RW | _PAGE_PRESENT), %eax
> +	movl $2048, %ecx
> +2:
> +	movl %eax, 0(%edi)
> +	addl $0x00200000, %eax
> +	addl $8, %edi
> +	decl %ecx
> +	jnz 2b
> +
> +	/* Enable the boot paging */
> +	movl $_pa(initial_page_table), %eax
> +	movl %eax, %cr3
> +	movl %cr0, %eax
> +	orl $(X86_CR0_PG | X86_CR0_PE), %eax
> +	movl %eax, %cr0
> +
> +	ljmp $__BOOT_CS,$3f
> +3:
> +	call xen_prepare_hvmlite
> +	mov $_pa(xen_hvmlite_boot_params), %esi
> +
> +	/* startup_32 doesn't expect paging and PAE to be on */
> +	ljmp $__BOOT_CS,$_pa(4f)
> +4:
> +	movl %cr0, %eax
> +	andl $~X86_CR0_PG, %eax
> +	movl %eax, %cr0
> +	movl %cr4, %eax
> +	andl $~X86_CR4_PAE, %eax
> +	movl %eax, %cr4
> +
> +	/* Restore initial_pg_pmd to its original (zero) state */
> +	movl $_pa(initial_pg_pmd), %edi
> +	xorl %eax, %eax
> +	movl $(PAGE_SIZE/4), %ecx
> +	rep stosl
> +
> +	ljmp    $0x10, $_pa(startup_32)
> +#endif
> +
> +	.data
> +gdt:
> +	.word	gdt_end - gdt
> +	.long	_pa(gdt)
> +	.word	0
> +	.quad	0x0000000000000000 /* NULL descriptor */
> +#ifdef CONFIG_X86_64
> +	.quad	0x00af9a000000ffff /* __KERNEL_CS */
> +#else
> +	.quad	0x00cf9a000000ffff /* __KERNEL_CS */
> +#endif
> +	.quad	0x00cf92000000ffff /* __KERNEL_DS */
> +gdt_end:
> +
> +	.bss
> +	.balign 4
> +early_stack:
> +	.fill 16, 1, 0
> +early_stack_end:
> +
> +	ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
> +	             _ASM_PTR (hvmlite_start_xen - __START_KERNEL_map))
> diff --git a/include/xen/xen.h b/include/xen/xen.h
> index 0c0e3ef..6a0d3f3 100644
> --- a/include/xen/xen.h
> +++ b/include/xen/xen.h
> @@ -29,6 +29,12 @@ extern enum xen_domain_type xen_domain_type;
>  #define xen_initial_domain()	(0)
>  #endif	/* CONFIG_XEN_DOM0 */
>  
> +#ifdef CONFIG_XEN_PVHVM
> +extern int xen_hvmlite;
> +#else
> +#define xen_hvmlite		(0)
> +#endif
> +
>  #ifdef CONFIG_XEN_PVH
>  /* This functionality exists only for x86. The XEN_PVHVM support exists
>   * only in x86 world - hence on ARM it will be always disabled.
> -- 
> 1.7.1
> 
>
Andrew Cooper Jan. 23, 2016, 12:30 a.m. UTC | #2
On 22/01/2016 23:32, Luis R. Rodriguez wrote:
> On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote:
>> +	/*
>> +	 * See Documentation/x86/boot.txt.
>> +	 *
>> +	 * Version 2.12 supports Xen entry point but we will use default x86/PC
>> +	 * environment (i.e. hardware_subarch 0).
>> +	 */
>> +	xen_hvmlite_boot_params.hdr.version = 0x212;
>> +	xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */
>> +}
> I realize PV got away with setting up boot_params on C code but best
> ask now that this new code is being introduced: why can't we just have
> the Xen hypervisor fill this in? It'd save us all this code.

I agree that this looks to be a mess.  Having said that, the DMLite boot
protocol is OS agnostic, and will be staying that way.

It happens to look suspiciously like multiboot; a flat 32bit protected
mode entry (at a location chosen in an ELF note), with %ebx pointing to
an in-ram structure containing things like a command line and module list.

I would have though the correct way to do direct Linux support would be
to have a very small init stub which constructs an appropriate zero
page, and lets the native entry point get on with things.

This covers the usecase where people wish to boot a specific Linux
kernel straight out of the dom0 filesystem.

For the alternative usecase of general OS support, dom0 would boot
something such as grub2 as the DMLite "kernel", at which point all
stooging around in the guests filesystem is done from guest context,
rather than control context (mitigating a substantial attack surface).

~Andrew
Luis Chamberlain Jan. 23, 2016, 12:45 a.m. UTC | #3
On Fri, Jan 22, 2016 at 4:30 PM, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
> I would have though the correct way to do direct Linux support would be
> to have a very small init stub which constructs an appropriate zero
> page, and lets the native entry point get on with things.

As hpa noted recently in another thread [0] that is precisely what
hardware_subarch and hardware_subarch_data was meant to be used for,
and its what I'm alluding to.

The only thing though is that as far as we're concerned on x86 we had
expected use of hardware_subarch and hardware_subarch_data only for
PV, and not for HVM. This seems to be HVM related, but I think this is
just a rebranding of PVH to HVMLite, right, so I think the use case of
hardware_subarch and hardware_subarch_data are still welcomed as
expected in the original design.

[0] http://lkml.kernel.org/r/56A130B5.8060701@zytor.com

 Luis
Luis Chamberlain Jan. 23, 2016, 12:55 a.m. UTC | #4
On Fri, Jan 22, 2016 at 4:30 PM, Andrew Cooper
<andrew.cooper3@citrix.com> wrote:
> the DMLite boot
> protocol is OS agnostic, and will be staying that way.

What's the DMLite boot protocol? Is that the protocol that is defined
by Xen to boot Xen guests and dom0? Is this well documented somewhere?

To be clear are you saying that by no means will Xen change to instead
of setting a, say zero-page, it would just want to always stuff a xen
struct, pass that to the boot entry, and then expect always the guest
kernel to always parse this?

If true, then by no means, no matter how hard we try, and no matter
what we do on the Linux front to help clean things up will we be able
to have a unified bare metal / Xen entry. I'm noting it could be
possible though provided we do just set the zero page, the subarch to
Xen and subarch_data to the Xen custom data structure.

 Luis
Andrew Cooper Jan. 23, 2016, 2:49 p.m. UTC | #5
On 23/01/2016 00:55, Luis R. Rodriguez wrote:
> On Fri, Jan 22, 2016 at 4:30 PM, Andrew Cooper
> <andrew.cooper3@citrix.com> wrote:
>> the DMLite boot
>> protocol is OS agnostic, and will be staying that way.
> What's the DMLite boot protocol?

It is  a statement of the initial state of a DMLite container.

> Is that the protocol that is defined by Xen to boot Xen guests and dom0?

Technically it is toolstack which constructs this initial state, but
broadly yes.

>  Is this well documented somewhere?

There is a patch out on the list formalising the ABI in writing. (Roger:
ping?)

> To be clear are you saying that by no means will Xen change to instead
> of setting a, say zero-page, it would just want to always stuff a xen
> struct, pass that to the boot entry, and then expect always the guest
> kernel to always parse this?

Correct.  Why do you think we should lumber non-Linux guests with a
Linux-specific boot protocol?

Quite apart from the fact that Linux is second to the table here
(FreeBSD was first), it causes inappropriate linkage between the
toolstack and the version of Linux wishing to be booted.

>
> If true, then by no means, no matter how hard we try, and no matter
> what we do on the Linux front to help clean things up will we be able
> to have a unified bare metal / Xen entry. I'm noting it could be
> possible though provided we do just set the zero page, the subarch to
> Xen and subarch_data to the Xen custom data structure.

All you need is a very small stub which starts per the DMLite ABI, sets
up an appropriate zero_page, and jumps to the native entrypoint.

However, this stub belongs in Linux, not in the Xen toolstack.  That
way, when the Linux boot protocol is modified, both sides can be updated
accordingly.

~Andrew
Konrad Rzeszutek Wilk Jan. 23, 2016, 3:34 p.m. UTC | #6
>However, this stub belongs in Linux, not in the Xen toolstack.  That
>way, when the Linux boot protocol is modified, both sides can be
>updated
>accordingly.

I would add that this idea is borrowed from the EFI stub code that Linux has which also constructs the boot parameter structure when invoked (either from firmware or from EFI shell).
H. Peter Anvin Jan. 23, 2016, 4:01 p.m. UTC | #7
On January 23, 2016 7:34:33 AM PST, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
>
>>However, this stub belongs in Linux, not in the Xen toolstack.  That
>>way, when the Linux boot protocol is modified, both sides can be
>>updated
>>accordingly.
>
>I would add that this idea is borrowed from the EFI stub code that
>Linux has which also constructs the boot parameter structure when
>invoked (either from firmware or from EFI shell).

There is a huge difference though: EFI is a widely used multivendor industry standard.  You are taking about something Xen-specific, and which in good Xen tradition isn't even documented, apparently (did we ever get documentation for the hypervisor ABI?)

Asking "why burden Xen with something Linux-specific" is a pretty extreme case of the tail wagging the dog.

That being said, before any code can be put anywhere, it needs to be written.  We can argue where to put it later.  We went through this process with the EFI stub, too: a standalone implementation (efilinux) first.
Konrad Rzeszutek Wilk Jan. 23, 2016, 4:12 p.m. UTC | #8
On January 23, 2016 11:01:06 AM EST, "H. Peter Anvin" <hpa@zytor.com> wrote:
>On January 23, 2016 7:34:33 AM PST, Konrad Rzeszutek Wilk
><konrad.wilk@oracle.com> wrote:
>>
>>>However, this stub belongs in Linux, not in the Xen toolstack.  That
>>>way, when the Linux boot protocol is modified, both sides can be
>>>updated
>>>accordingly.
>>
>>I would add that this idea is borrowed from the EFI stub code that
>>Linux has which also constructs the boot parameter structure when
>>invoked (either from firmware or from EFI shell).
>
>There is a huge difference though: EFI is a widely used multivendor
>industry standard.  You are taking about something Xen-specific, and
>which in good Xen tradition isn't even documented, apparently (did we
>ever get documentation for the hypervisor ABI?)
>
>Asking "why burden Xen with something Linux-specific" is a pretty
>extreme case of the tail wagging the dog.
>
>That being said, before any code can be put anywhere, it needs to be
>written.  We can argue where to put it later.  We went through this
>process with the EFI stub, too: a standalone implementation (efilinux)
>first.

http://lists.xenproject.org/archives/html/xen-devel/2015-12/msg01793.html

I believe is the latest version. Roger (CCed) has probably an updated one.
H. Peter Anvin Jan. 23, 2016, 6:28 p.m. UTC | #9
On January 23, 2016 8:12:23 AM PST, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
>On January 23, 2016 11:01:06 AM EST, "H. Peter Anvin" <hpa@zytor.com>
>wrote:
>>On January 23, 2016 7:34:33 AM PST, Konrad Rzeszutek Wilk
>><konrad.wilk@oracle.com> wrote:
>>>
>>>>However, this stub belongs in Linux, not in the Xen toolstack.  That
>>>>way, when the Linux boot protocol is modified, both sides can be
>>>>updated
>>>>accordingly.
>>>
>>>I would add that this idea is borrowed from the EFI stub code that
>>>Linux has which also constructs the boot parameter structure when
>>>invoked (either from firmware or from EFI shell).
>>
>>There is a huge difference though: EFI is a widely used multivendor
>>industry standard.  You are taking about something Xen-specific, and
>>which in good Xen tradition isn't even documented, apparently (did we
>>ever get documentation for the hypervisor ABI?)
>>
>>Asking "why burden Xen with something Linux-specific" is a pretty
>>extreme case of the tail wagging the dog.
>>
>>That being said, before any code can be put anywhere, it needs to be
>>written.  We can argue where to put it later.  We went through this
>>process with the EFI stub, too: a standalone implementation (efilinux)
>>first.
>
>http://lists.xenproject.org/archives/html/xen-devel/2015-12/msg01793.html
>
>I believe is the latest version. Roger (CCed) has probably an updated
>one.

I suspect you should write a noninteractive bootloader as a reference implementation, and then consider porting Grub2 and maybe Syslinux to your ABI for those that want a full featured interactive bootloader compatible with the normal management.
Roger Pau Monné Jan. 25, 2016, 10:30 a.m. UTC | #10
El 23/01/16 a les 17.12, Konrad Rzeszutek Wilk ha escrit:
> On January 23, 2016 11:01:06 AM EST, "H. Peter Anvin" <hpa@zytor.com> wrote:
>> On January 23, 2016 7:34:33 AM PST, Konrad Rzeszutek Wilk
>> <konrad.wilk@oracle.com> wrote:
>>>
>>>> However, this stub belongs in Linux, not in the Xen toolstack.  That
>>>> way, when the Linux boot protocol is modified, both sides can be
>>>> updated
>>>> accordingly.
>>>
>>> I would add that this idea is borrowed from the EFI stub code that
>>> Linux has which also constructs the boot parameter structure when
>>> invoked (either from firmware or from EFI shell).
>>
>> There is a huge difference though: EFI is a widely used multivendor
>> industry standard.  You are taking about something Xen-specific, and
>> which in good Xen tradition isn't even documented, apparently (did we
>> ever get documentation for the hypervisor ABI?)
>>
>> Asking "why burden Xen with something Linux-specific" is a pretty
>> extreme case of the tail wagging the dog.
>>
>> That being said, before any code can be put anywhere, it needs to be
>> written.  We can argue where to put it later.  We went through this
>> process with the EFI stub, too: a standalone implementation (efilinux)
>> first.
> 
> http://lists.xenproject.org/archives/html/xen-devel/2015-12/msg01793.html
> 
> I believe is the latest version. Roger (CCed) has probably an updated one.

Yes, the technical side has not changed at all. I've just send an
updated version with some wording changes:

http://lists.xenproject.org/archives/html/xen-devel/2016-01/msg03029.html

Roger.
Boris Ostrovsky Jan. 25, 2016, 3:08 p.m. UTC | #11
On 01/22/2016 07:30 PM, Andrew Cooper wrote:
> On 22/01/2016 23:32, Luis R. Rodriguez wrote:
>> On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote:
>>> +	/*
>>> +	 * See Documentation/x86/boot.txt.
>>> +	 *
>>> +	 * Version 2.12 supports Xen entry point but we will use default x86/PC
>>> +	 * environment (i.e. hardware_subarch 0).
>>> +	 */
>>> +	xen_hvmlite_boot_params.hdr.version = 0x212;
>>> +	xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */
>>> +}
>> I realize PV got away with setting up boot_params on C code but best
>> ask now that this new code is being introduced: why can't we just have
>> the Xen hypervisor fill this in? It'd save us all this code.
> I agree that this looks to be a mess.  Having said that, the DMLite boot
> protocol is OS agnostic, and will be staying that way.
>
> It happens to look suspiciously like multiboot; a flat 32bit protected
> mode entry (at a location chosen in an ELF note), with %ebx pointing to
> an in-ram structure containing things like a command line and module list.
>
> I would have though the correct way to do direct Linux support would be
> to have a very small init stub which constructs an appropriate zero
> page, and lets the native entry point get on with things.

Which is really what 
hvmlite_start_xen()->xen_prepare_hvmlite()->hvmlite_bootparams() is 
doing. Not much more than that (for 64-bit it also loads identity 
mapping because that's what startup_64 wants)

-boris

>
> This covers the usecase where people wish to boot a specific Linux
> kernel straight out of the dom0 filesystem.
>
> For the alternative usecase of general OS support, dom0 would boot
> something such as grub2 as the DMLite "kernel", at which point all
> stooging around in the guests filesystem is done from guest context,
> rather than control context (mitigating a substantial attack surface).
>
> ~Andrew
Boris Ostrovsky Jan. 25, 2016, 4:08 p.m. UTC | #12
On 01/22/2016 06:32 PM, Luis R. Rodriguez wrote:
> On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote:
>
>> +/*
>> + * This routine (and those that it might call) should not use
>> + * anything that lives in .bss since that segment will be cleared later
>> + */
>> +void __init xen_prepare_hvmlite(void)
>> +{
>> +	u32 eax, ecx, edx, msr;
>> +	u64 pfn;
>> +
>> +	cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx);
>> +	pfn = __pa(hypercall_page);
>> +	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
>> +
>> +	pv_info.name = "Xen HVMlite";
>> +	xen_domain_type = XEN_HVM_DOMAIN;
>> +	xen_hvmlite = 1;
>> +
>> +	x86_init.oem.arch_setup = xen_init_kernel;
>> +	x86_init.oem.banner = xen_banner;
>> +
>> +	hvmlite_bootparams();
>> +}
>> +#endif
> If the boot_params.hdr.hardware_subarch_data pointed to a custom
> struct then the first C entry point for Xen could shuffle this and
> set this too, by still using less asm entry helpers. We'd still
> need this run but with the linker table I think we could have
> a stub small stub for hvm run, it would not be a call from asm.

Perhaps, but someone would still have to set hardware_subarch. And it's 
hvmlite_bootparams() that does it.

And that's not sufficient, I think. There are still some things that 
trampoline code sets up (e.g. page tables for 64-bit).

-boris
Luis Chamberlain Jan. 25, 2016, 9:12 p.m. UTC | #13
On Mon, Jan 25, 2016 at 11:08:47AM -0500, Boris Ostrovsky wrote:
> On 01/22/2016 06:32 PM, Luis R. Rodriguez wrote:
> >On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote:
> >
> >>+/*
> >>+ * This routine (and those that it might call) should not use
> >>+ * anything that lives in .bss since that segment will be cleared later
> >>+ */
> >>+void __init xen_prepare_hvmlite(void)
> >>+{
> >>+	u32 eax, ecx, edx, msr;
> >>+	u64 pfn;
> >>+
> >>+	cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx);
> >>+	pfn = __pa(hypercall_page);
> >>+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> >>+
> >>+	pv_info.name = "Xen HVMlite";
> >>+	xen_domain_type = XEN_HVM_DOMAIN;
> >>+	xen_hvmlite = 1;
> >>+
> >>+	x86_init.oem.arch_setup = xen_init_kernel;
> >>+	x86_init.oem.banner = xen_banner;
> >>+
> >>+	hvmlite_bootparams();
> >>+}
> >>+#endif
> >If the boot_params.hdr.hardware_subarch_data pointed to a custom
> >struct then the first C entry point for Xen could shuffle this and
> >set this too, by still using less asm entry helpers. We'd still
> >need this run but with the linker table I think we could have
> >a stub small stub for hvm run, it would not be a call from asm.
> 
> Perhaps, but someone would still have to set hardware_subarch. And
> it's hvmlite_bootparams() that does it.

No, Xen would do it as well, essentially all of hvmlite_bootparams() could be
done in Xen.

> And that's not sufficient, I think. There are still some things that
> trampoline code sets up (e.g. page tables for 64-bit).

Sure, but only what is required, that should be rather smaller.

  Luis
H. Peter Anvin Jan. 25, 2016, 9:21 p.m. UTC | #14
On 01/25/16 13:12, Luis R. Rodriguez wrote:
>>
>> Perhaps, but someone would still have to set hardware_subarch. And
>> it's hvmlite_bootparams() that does it.
> 
> No, Xen would do it as well, essentially all of hvmlite_bootparams() could be
> done in Xen.
> 

Or a stub code.

	-hpa
Luis Chamberlain Jan. 25, 2016, 10:19 p.m. UTC | #15
On Sat, Jan 23, 2016 at 02:49:36PM +0000, Andrew Cooper wrote:
> On 23/01/2016 00:55, Luis R. Rodriguez wrote:
> > On Fri, Jan 22, 2016 at 4:30 PM, Andrew Cooper
> > <andrew.cooper3@citrix.com> wrote:
> >> the DMLite boot
> >> protocol is OS agnostic, and will be staying that way.
> > What's the DMLite boot protocol?
> 
> It is  a statement of the initial state of a DMLite container.

I see.

> > Is that the protocol that is defined by Xen to boot Xen guests and dom0?
> 
> Technically it is toolstack which constructs this initial state, but
> broadly yes.

I see thanks.

> >  Is this well documented somewhere?
> 
> There is a patch out on the list formalising the ABI in writing. (Roger:
> ping?)

Thanks for this.

> > To be clear are you saying that by no means will Xen change to instead
> > of setting a, say zero-page, it would just want to always stuff a xen
> > struct, pass that to the boot entry, and then expect always the guest
> > kernel to always parse this?
> 
> Correct.  Why do you think we should lumber non-Linux guests with a
> Linux-specific boot protocol?

You should by no means do that, agreed, however one should have the
flexibility to support such types of OSes that use page specific
settings for the OS. The way I see it this is an OS feature and
obviously it can vary.

> Quite apart from the fact that Linux is second to the table here
> (FreeBSD was first),

This means little but just history.

> it causes inappropriate linkage between the
> toolstack and the version of Linux wishing to be booted.

There are ways to do it in such a way that it does not create dependency issues
on Linux specific code.


0) The Xen way and EFI way

A generic data structure is passed to the entry point on the kernel to
load the kernel. The kernel in turn must parse this generic specific struct
and interprets it and translate it to the kernel specific values.

1) The qemu way:

One way is to simply not refer to the boot_params data structures but IMHO that
produces really ugly code. The qemu folks did it this way, so qemu does not use
arch/x86/include/uapi/asm/bootparam.h in any way, instead it uses offsets from
a char *header. Refer to load_linux() in hw/i386/pc.c, for instance:

header[0x211] |= 0x80;  /* CAN_USE_HEAP */

2) The grub way:

Another way, which grub uses, is to define their own data structures based
on arch/x86/include/uapi/asm/bootparam.h, with their own data types, and refer
to them in code, for instance refer to grub_cmd_linux() on the grub file
grub-core/loader/i386/pc/linux.c and its copy definition of the header definition
in include/grub/i386/linux.h.

lh.loadflags |= GRUB_LINUX_FLAG_CAN_USE_HEAP;

The way lguest does it in the lguest launcher is IMHO the cleanest of course,
but it includes asm/bootparam.h and uses that in code:

/* Boot protocol version: 2.07 supports the fields for lguest. */       
boot->hdr.version = 0x207;  

But that does mean copying over or using the bootparam.h file and using
linux data types.

3) Merge of xen way and using subarch_data

Only set the subarch and subarch_data pointer, and have the kernel then
read the generic data structure and parse it as it used to, the benefit
is sharing a common entry point.

No one uses this yet. But I think its a reasonable compromise.

Perhaps there are other ways as well.

> > If true, then by no means, no matter how hard we try, and no matter
> > what we do on the Linux front to help clean things up will we be able
> > to have a unified bare metal / Xen entry. I'm noting it could be
> > possible though provided we do just set the zero page, the subarch to
> > Xen and subarch_data to the Xen custom data structure.
> 
> All you need is a very small stub which starts per the DMLite ABI, sets
> up an appropriate zero_page, and jumps to the native entrypoint.

Right.

> However, this stub belongs in Linux, not in the Xen toolstack.  That
> way, when the Linux boot protocol is modified, both sides can be updated
> accordingly.

Makes sense the different entry points just seems best avoided if possible.

  Luis
Boris Ostrovsky Jan. 25, 2016, 10:28 p.m. UTC | #16
On 01/25/2016 04:21 PM, H. Peter Anvin wrote:
> On 01/25/16 13:12, Luis R. Rodriguez wrote:
>>> Perhaps, but someone would still have to set hardware_subarch. And
>>> it's hvmlite_bootparams() that does it.
>> No, Xen would do it as well, essentially all of hvmlite_bootparams() could be
>> done in Xen.
>>
> Or a stub code.

This patch in fact is the stub for Xen HVMlite guests, after we are done 
with it we jump to bare-metal startup code (i.e startup_32|64)

-boris
Boris Ostrovsky Jan. 25, 2016, 10:55 p.m. UTC | #17
On 01/25/2016 05:19 PM, Luis R. Rodriguez wrote:
> On Sat, Jan 23, 2016 at 02:49:36PM +0000, Andrew Cooper wrote:
>
>
>> it causes inappropriate linkage between the
>> toolstack and the version of Linux wishing to be booted.
> There are ways to do it in such a way that it does not create dependency issues
> on Linux specific code.
>
>
> 0) The Xen way and EFI way
>
> A generic data structure is passed to the entry point on the kernel to
> load the kernel. The kernel in turn must parse this generic specific struct
> and interprets it and translate it to the kernel specific values.
>
> 1) The qemu way:
>
> One way is to simply not refer to the boot_params data structures but IMHO that
> produces really ugly code. The qemu folks did it this way, so qemu does not use
> arch/x86/include/uapi/asm/bootparam.h in any way, instead it uses offsets from
> a char *header. Refer to load_linux() in hw/i386/pc.c, for instance:
>
> header[0x211] |= 0x80;  /* CAN_USE_HEAP */
>
> 2) The grub way:
>
> Another way, which grub uses, is to define their own data structures based
> on arch/x86/include/uapi/asm/bootparam.h, with their own data types, and refer
> to them in code, for instance refer to grub_cmd_linux() on the grub file
> grub-core/loader/i386/pc/linux.c and its copy definition of the header definition
> in include/grub/i386/linux.h.
>
> lh.loadflags |= GRUB_LINUX_FLAG_CAN_USE_HEAP;
>
> The way lguest does it in the lguest launcher is IMHO the cleanest of course,
> but it includes asm/bootparam.h and uses that in code:
>
> /* Boot protocol version: 2.07 supports the fields for lguest. */
> boot->hdr.version = 0x207;
>
> But that does mean copying over or using the bootparam.h file and using
> linux data types.
>
> 3) Merge of xen way and using subarch_data
>
> Only set the subarch and subarch_data pointer, and have the kernel then
> read the generic data structure and parse it as it used to, the benefit
> is sharing a common entry point.
>
> No one uses this yet. But I think its a reasonable compromise.
>
> Perhaps there are other ways as well.
>
>>> If true, then by no means, no matter how hard we try, and no matter
>>> what we do on the Linux front to help clean things up will we be able
>>> to have a unified bare metal / Xen entry. I'm noting it could be
>>> possible though provided we do just set the zero page, the subarch to
>>> Xen and subarch_data to the Xen custom data structure.
>> All you need is a very small stub which starts per the DMLite ABI, sets
>> up an appropriate zero_page, and jumps to the native entrypoint.
> Right.


I am trying to understand what your objection is to what is proposed in 
this patch. It is the size of the stub? If yes -- what would you like to 
leave out to be done later?


-boris

>
>> However, this stub belongs in Linux, not in the Xen toolstack.  That
>> way, when the Linux boot protocol is modified, both sides can be updated
>> accordingly.
> Makes sense the different entry points just seems best avoided if possible.
>
>    Luis
Luis Chamberlain Jan. 26, 2016, 6:34 p.m. UTC | #18
On Mon, Jan 25, 2016 at 05:28:08PM -0500, Boris Ostrovsky wrote:
> On 01/25/2016 04:21 PM, H. Peter Anvin wrote:
> >On 01/25/16 13:12, Luis R. Rodriguez wrote:
> >>>Perhaps, but someone would still have to set hardware_subarch. And
> >>>it's hvmlite_bootparams() that does it.
> >>No, Xen would do it as well, essentially all of hvmlite_bootparams() could be
> >>done in Xen.
> >>
> >Or a stub code.
> 
> This patch in fact is the stub for Xen HVMlite guests, after we are
> done with it we jump to bare-metal startup code (i.e startup_32|64)

Right the point is the stub need not be in Linux, I'll explain in the other
thread where I provided more details on the different known approaches.

  Luis
Andy Lutomirski Jan. 26, 2016, 6:46 p.m. UTC | #19
On Tue, Jan 26, 2016 at 10:34 AM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
> On Mon, Jan 25, 2016 at 05:28:08PM -0500, Boris Ostrovsky wrote:
>> On 01/25/2016 04:21 PM, H. Peter Anvin wrote:
>> >On 01/25/16 13:12, Luis R. Rodriguez wrote:
>> >>>Perhaps, but someone would still have to set hardware_subarch. And
>> >>>it's hvmlite_bootparams() that does it.
>> >>No, Xen would do it as well, essentially all of hvmlite_bootparams() could be
>> >>done in Xen.
>> >>
>> >Or a stub code.
>>
>> This patch in fact is the stub for Xen HVMlite guests, after we are
>> done with it we jump to bare-metal startup code (i.e startup_32|64)
>
> Right the point is the stub need not be in Linux, I'll explain in the other
> thread where I provided more details on the different known approaches.
>

ISTM if the Xen ABI-specified entry point has a different convention
than the Linux native entry, then the stub should live in Linux.  It
would be just a couple if lines of code, right?

The issue that caused headaches in the past isn't that there's code
that's executed only on native, it's that there are whole big
functions that are executed only on native for no good reason and that
aren't clearly marked.

If we had native_start_kernel and xen_start_kernel, and they both
called very quickly in to common_start_kernel, it would be very clear
what's going on.

--Andy
Boris Ostrovsky Jan. 26, 2016, 7 p.m. UTC | #20
On 01/26/2016 01:46 PM, Andy Lutomirski wrote:
> On Tue, Jan 26, 2016 at 10:34 AM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
>> On Mon, Jan 25, 2016 at 05:28:08PM -0500, Boris Ostrovsky wrote:
>>> On 01/25/2016 04:21 PM, H. Peter Anvin wrote:
>>>> On 01/25/16 13:12, Luis R. Rodriguez wrote:
>>>>>> Perhaps, but someone would still have to set hardware_subarch. And
>>>>>> it's hvmlite_bootparams() that does it.
>>>>> No, Xen would do it as well, essentially all of hvmlite_bootparams() could be
>>>>> done in Xen.
>>>>>
>>>> Or a stub code.
>>> This patch in fact is the stub for Xen HVMlite guests, after we are
>>> done with it we jump to bare-metal startup code (i.e startup_32|64)
>> Right the point is the stub need not be in Linux, I'll explain in the other
>> thread where I provided more details on the different known approaches.
>>
> ISTM if the Xen ABI-specified entry point has a different convention
> than the Linux native entry, then the stub should live in Linux.  It
> would be just a couple if lines of code, right?

It's not exactly a couple of lines but it's not large neither. It mainly 
sets up boot_params (similar to what make_boot_params() does for EFI). 
Plus, for 64-bit, it loads identity page tables and switches to long 
mode. And then jumps to bare-meta startup code.


>
> The issue that caused headaches in the past isn't that there's code
> that's executed only on native, it's that there are whole big
> functions that are executed only on native for no good reason and that
> aren't clearly marked.

This won't happen with HVMlite.

>
> If we had native_start_kernel and xen_start_kernel, and they both
> called very quickly in to common_start_kernel, it would be very clear
> what's going on.

What is now xen_start_kernel() is no longer the entry point for HVMlite. 
It is called as x86_init.oem.arch_setup() (or I may even move it to 
x86_hyper_xen.init_platform() or something like that).

-boris
Luis Chamberlain Jan. 26, 2016, 7:14 p.m. UTC | #21
On Tue, Jan 26, 2016 at 11:00 AM, Boris Ostrovsky
<boris.ostrovsky@oracle.com> wrote:
> On 01/26/2016 01:46 PM, Andy Lutomirski wrote:
>>
>> On Tue, Jan 26, 2016 at 10:34 AM, Luis R. Rodriguez <mcgrof@suse.com>
>> wrote:
>>>
>>> On Mon, Jan 25, 2016 at 05:28:08PM -0500, Boris Ostrovsky wrote:
>>>>
>>>> On 01/25/2016 04:21 PM, H. Peter Anvin wrote:
>>>>>
>>>>> On 01/25/16 13:12, Luis R. Rodriguez wrote:
>>>>>>>
>>>>>>> Perhaps, but someone would still have to set hardware_subarch. And
>>>>>>> it's hvmlite_bootparams() that does it.
>>>>>>
>>>>>> No, Xen would do it as well, essentially all of hvmlite_bootparams()
>>>>>> could be
>>>>>> done in Xen.
>>>>>>
>>>>> Or a stub code.
>>>>
>>>> This patch in fact is the stub for Xen HVMlite guests, after we are
>>>> done with it we jump to bare-metal startup code (i.e startup_32|64)
>>>
>>> Right the point is the stub need not be in Linux, I'll explain in the
>>> other
>>> thread where I provided more details on the different known approaches.
>>>
>> ISTM if the Xen ABI-specified entry point has a different convention
>> than the Linux native entry, then the stub should live in Linux.  It
>> would be just a couple if lines of code, right?
>
>
> It's not exactly a couple of lines but it's not large neither. It mainly
> sets up boot_params (similar to what make_boot_params() does for EFI). Plus,
> for 64-bit, it loads identity page tables and switches to long mode. And
> then jumps to bare-meta startup code.

This terse summary provides an example of the issue I'm highlighting.
Even though the stub is small we undermine its impact!

Although the stub is small the different entry point also enables
subtle additions which are required, although they are minimal they
are important and if not considered for new x86 features causes
regressions. I don't care what people tell me about "this should have
been caught by code review, and no one CC'd me -- whaa!" -- this is a
silly expectation and I think we should do better.

Case in point:

xen_start_kernel() has seen regressions now on both cr4_init_shadow()
which you forgot to add to Xen Andy -- and later Boris fixed. Another
example: the latest one is a kasan init -- which to this day remains
fucked up -- a Kasan enabled PV guest crashes, and fixing is no where
in sight. I flagged this a while ago, and I think we should put a
proactive measure in place for that.

The linker table stuff I'm doing was not for kicks -- its a means to
an end here, and although I can't yet read subarch at early init, if
we had it, then things link xen_start_kernel() could just be an x86
init stub, it'd be the first stub Xen runs. You can keep whatever
boot_params setup stub it does today, even though I think its cleaner
done in Xen, but at the very least it could at least instead just
*read* the Xen custom generic structure to parse and set boot_params
by using subarch_data pointer.

I also have been reading the history of code changes on the other
entry points in Linux and even though the code is small I see tons of
commits in there for minor but critical fixes all over, likewise
comments and recommendations and visions to unify / share code. I
refuse to accept that we should undermine the issues of a new entry
point or leaving the situation as-is with dual entry points for x86_64
 / xen if we can instead just cleanly unify even asm entry points.

>> The issue that caused headaches in the past isn't that there's code
>> that's executed only on native, it's that there are whole big
>> functions that are executed only on native for no good reason and that
>> aren't clearly marked.

Its more than that, as I noted.

> This won't happen with HVMlite.

And that's fine, its just I think we can avoid yet-another entry --
even if we still are coming into a common C entry later.

>> If we had native_start_kernel and xen_start_kernel, and they both
>> called very quickly in to common_start_kernel, it would be very clear
>> what's going on.
>
> What is now xen_start_kernel() is no longer the entry point for HVMlite. It
> is called as x86_init.oem.arch_setup() (or I may even move it to
> x86_hyper_xen.init_platform() or something like that).

And that's a huge win! Yet I invite us to consider other prospects to
even merge more and simplify more.

 Luis
Luis Chamberlain Jan. 26, 2016, 8:30 p.m. UTC | #22
On Mon, Jan 25, 2016 at 05:55:02PM -0500, Boris Ostrovsky wrote:
> On 01/25/2016 05:19 PM, Luis R. Rodriguez wrote:
> >On Sat, Jan 23, 2016 at 02:49:36PM +0000, Andrew Cooper wrote:
> >
> >
> >>it causes inappropriate linkage between the
> >>toolstack and the version of Linux wishing to be booted.
> >There are ways to do it in such a way that it does not create dependency issues
> >on Linux specific code.
> >
> >
> >0) The Xen way and EFI way
> >
> >A generic data structure is passed to the entry point on the kernel to
> >load the kernel. The kernel in turn must parse this generic specific struct
> >and interprets it and translate it to the kernel specific values.
> >
> >1) The qemu way:
> >
> >One way is to simply not refer to the boot_params data structures but IMHO that
> >produces really ugly code. The qemu folks did it this way, so qemu does not use
> >arch/x86/include/uapi/asm/bootparam.h in any way, instead it uses offsets from
> >a char *header. Refer to load_linux() in hw/i386/pc.c, for instance:
> >
> >header[0x211] |= 0x80;  /* CAN_USE_HEAP */
> >
> >2) The grub way:
> >
> >Another way, which grub uses, is to define their own data structures based
> >on arch/x86/include/uapi/asm/bootparam.h, with their own data types, and refer
> >to them in code, for instance refer to grub_cmd_linux() on the grub file
> >grub-core/loader/i386/pc/linux.c and its copy definition of the header definition
> >in include/grub/i386/linux.h.
> >
> >lh.loadflags |= GRUB_LINUX_FLAG_CAN_USE_HEAP;
> >
> >The way lguest does it in the lguest launcher is IMHO the cleanest of course,
> >but it includes asm/bootparam.h and uses that in code:
> >
> >/* Boot protocol version: 2.07 supports the fields for lguest. */
> >boot->hdr.version = 0x207;
> >
> >But that does mean copying over or using the bootparam.h file and using
> >linux data types.
> >
> >3) Merge of xen way and using subarch_data
> >
> >Only set the subarch and subarch_data pointer, and have the kernel then
> >read the generic data structure and parse it as it used to, the benefit
> >is sharing a common entry point.
> >
> >No one uses this yet. But I think its a reasonable compromise.
> >
> >Perhaps there are other ways as well.
> >
> >>>If true, then by no means, no matter how hard we try, and no matter
> >>>what we do on the Linux front to help clean things up will we be able
> >>>to have a unified bare metal / Xen entry. I'm noting it could be
> >>>possible though provided we do just set the zero page, the subarch to
> >>>Xen and subarch_data to the Xen custom data structure.
> >>All you need is a very small stub which starts per the DMLite ABI, sets
> >>up an appropriate zero_page, and jumps to the native entrypoint.
> >Right.
> 
> 
> I am trying to understand what your objection is to what is proposed
> in this patch. It is the size of the stub? If yes -- what would you
> like to leave out to be done later?

I explained that here briefly:

> >>However, this stub belongs in Linux, not in the Xen toolstack.  That
> >>way, when the Linux boot protocol is modified, both sides can be updated
> >>accordingly.
>
> >Makes sense the different entry points just seems best avoided if possible.

But let me elaborate:

We want to strive towards avoiding new entry points on Linux at all costs,
there have been historic issues with having multiple entry points, some of these
may relate to Xen, some not so much but one should not ignore history. A few
related patches, this incldues EFI entry points since Konrad brings up Xen's
approach was inspired by that as well. Read from bottom up.

Kasan: not yet fixed, the feature completely ignored Xen PV guests as a
possible candidate for x86. Xen PV guests with Kasan enabled crash. The fix
is not so trivial. What I'd prefer is for kasan to have a feature to be disabled
at run time, and with that in place we can simply at early boot disable Kasan
early on boot on Xen until Kasan is properly implemented and supported on Xen.

5054daa285beaf706f051fbd395dc36c9f0f907f - x86/xen: Initialize cr4 shadow for 64-bit PV(H) guests

Due to xen's separate entry point and the addition of a cr4 shadow, only
native had its feature added, this made Xen crash.

f3670394c29ff3730638762c1760fd2f624e6d7b - Revert "x86/efi: Fixup GOT in all boot code paths"

Fails to boot LInus' Sony Vaio Pro 11

9cb0e394234d244fe5a97e743ec9dd7ddff7e64b - x86/efi: Fixup GOT in all boot code paths

Because of the multiple boot entry points the fixup for GOT needs to be done in
different places.

7e8213c1f3acc064aef37813a39f13cbfe7c3ce7 - x86/efi: Correct EFI boot stub use of code32_start

Title says it.

b8ff87a6158886771677e6dc8139bac6e3cba717 - x86/efi: Firmware agnostic handover entry points

Fixes the issue of a mismatch on 32-bit and 64-bit between firmware and kernel run.

99f857db8857aff691c51302f93648263ed07eb1 - x86, build: Dynamically find entry points in compressed startup code

Boot loaders abuse static entry point. Perhaps not directly related to Xen but its a
lesson to learn that regardless of what you document people may abuse the entry points
in very unexpected ways.

f791620fa7517e1045742c475a7f005db9a634b8 - x86, efi: Fix 32-bit EFI handover protocol entry point

After the EFI handover protocol was added 32-bit boots had an issue.

9ca8f72a9297f2052d806bd1111e176533aa69bd  - x86, efi: Handover Protocol

*Only* use an EFI boot stub for EFI capable to avoid duplicate code in
boot loaders to set up and boot Linux.

51b26ada79b605ed709ddcedbb6012e8f8e0ebed - x86: unify arch/x86/boot/compressed/vmlinux_*.lds

Linus unifies vmlinux_*.lds and calls to unify startup_32() and startup_64().
This hasn't been done yet.

--

As someone new reading all this code / commits all I can do is cringe when I see
yet-another-entry-point being added. I think we can do better, so we should strive
to that unless there are real technical impossibilities here.

What I'm proposing?

1) Lets see if we can put a proactive stop-gap to issues such as the cr4 shadow
bug and Kasan bugs from creeping in. This should not only help PV but perhaps
HVMLite. If you'd like to help with that refer to this thread:

http://lkml.kernel.org/r/CAB=NE6VTCRCazcNpCdJ7pN1eD3=x_fcGOdH37MzVpxkKEN5esw@mail.gmail.com

2) asm code sharing prospects - rebranding of PVH to HVMlite

It sounds like HVMlite is really just a clean PVH implementation so we'll
be doing this it seems according to this approach:

a) Since PVH brand is taken add new new Xen clean solution as "HVMLite implementation"
b) Drop PVH
c) Re-brand HVMLite as PVH

Is there really no asm code to share between PV and HVMlite ? How about between
PV and other Linux asm ? Specifically I'm talking about a huge new entry point
called hvmlite_start_xen() of asm code. How much sharing or duplication
is being avoided here?

3) C code sharing prospects: rebranding of PVH to HVMlite

This code seems to share a lot from PV guest. Can we combine?

4) hardware_subarch, hardware_subarch_data and future prospects

Your patch relies on a *new* Linux entry point. Sure, we had one
for EFI, and sure there is another for Xen PV, but since you're just
rebranding PVH to HVMlite and given historic issues with any new
Linux entry points I'd like for us to take a breather and evaluate
the extent our lofty unification goals, and how the x86 boot protocol
could help with this already.

Now, perhaps today it may seem as difficult to unify asm entry points
today all over, but if we can take baby steps towards that I think that
should be seriously evaluated.

For instance, we should consider on relying on hardware_subarch and
hardware_subarch_data to fetch the hvmlite_start_info by taking advantage of
the x86 boot protocol.  The hvmlite_start_info is what Xen uses to send us info
about the guest as your patch proposes (this matches the Xen PV style entry),
we stash it into a standard Linux boot_params variable called
xen_hvmlite_boot_params for the Xen guest in hvmlite_bootparam(). This
data structure and code seems to match very much the PV guest structure,
why not just use a union and differentiate on PV subtype ? If you want to avoid
a lot of PV calls for HVMlite it seems you could just take advantage of
subarch Xen type, and differentiate on the subarch_data within Xen code
to make that code just PV sepecific.

I only see gains by using the Xen subarch, so would like to know why PC is
being pushed.

  Luis
Boris Ostrovsky Jan. 26, 2016, 9:51 p.m. UTC | #23
On 01/26/2016 03:30 PM, Luis R. Rodriguez wrote:
>
> What I'm proposing?
>
> 1) Lets see if we can put a proactive stop-gap to issues such as the cr4 shadow
> bug and Kasan bugs from creeping in. This should not only help PV but perhaps
> HVMLite. If you'd like to help with that refer to this thread:
>
> http://lkml.kernel.org/r/CAB=NE6VTCRCazcNpCdJ7pN1eD3=x_fcGOdH37MzVpxkKEN5esw@mail.gmail.com
>
> 2) asm code sharing prospects - rebranding of PVH to HVMlite
>
> It sounds like HVMlite is really just a clean PVH implementation so we'll
> be doing this it seems according to this approach:
>
> a) Since PVH brand is taken add new new Xen clean solution as "HVMLite implementation"
> b) Drop PVH
> c) Re-brand HVMLite as PVH
>
> Is there really no asm code to share between PV and HVMlite ?

Not the early boot code.

>   How about between
> PV and other Linux asm ? Specifically I'm talking about a huge new entry point
> called hvmlite_start_xen() of asm code. How much sharing or duplication
> is being avoided here?

I don't see that we can share much here. Especially given that 
hvmlite_start() is a 32-bit entry point so we can't start a 64-bit PV guest.

I suppose we can find some common (C) code that sets boot_params but on 
the first sight they are quite different too.

(And whether it's huge is a matter of opinion ;-). It is more than a 
couple of instructions, I'll give you that)

>
> 3) C code sharing prospects: rebranding of PVH to HVMlite
>
> This code seems to share a lot from PV guest. Can we combine?

HVMlite closely follows HVM (and baremetal) code path. There is some 
common setup code with PV (which is what the second patch in this series 
is trying to factor out)

>
> 4) hardware_subarch, hardware_subarch_data and future prospects
>
> Your patch relies on a *new* Linux entry point. Sure, we had one
> for EFI, and sure there is another for Xen PV, but since you're just
> rebranding PVH to HVMlite and given historic issues with any new
> Linux entry points I'd like for us to take a breather and evaluate
> the extent our lofty unification goals, and how the x86 boot protocol
> could help with this already.

I am not sure I see how you can avoid having new entry point. For 
example, all HVMlite guests start in 32-bit mode. Who will switch to 
long mode?

>
> Now, perhaps today it may seem as difficult to unify asm entry points
> today all over, but if we can take baby steps towards that I think that
> should be seriously evaluated.
>
> For instance, we should consider on relying on hardware_subarch and
> hardware_subarch_data to fetch the hvmlite_start_info by taking advantage of
> the x86 boot protocol.  The hvmlite_start_info is what Xen uses to send us info
> about the guest as your patch proposes (this matches the Xen PV style entry),
> we stash it into a standard Linux boot_params variable called
> xen_hvmlite_boot_params for the Xen guest in hvmlite_bootparam(). This
> data structure and code seems to match very much the PV guest structure,

No, the two don't match at all.

> why not just use a union and differentiate on PV subtype ? If you want to avoid
> a lot of PV calls for HVMlite it seems you could just take advantage of
> subarch Xen type, and differentiate on the subarch_data within Xen code
> to make that code just PV sepecific.
>
> I only see gains by using the Xen subarch, so would like to know why PC is
> being pushed.

It's not that subarch 0 is being pushed here. It's that I don't see how 
it can be useful for this particular guest type. Maybe as we add new 
features (or discover something that we've missed up till now) we can 
switch to using it. If you think we should delay initializing 
boot_params until then --- I will disagree: boot_params are used before 
we look at subarch and I don't believe it makes sense to pick and choose 
what to initialize before and what can wait.

(And I am not sure it can be useful on PV neither, at least the way it 
is used now. You will not reach the point in the (32-bit) code where it 
is tested. You will die way earlier (I think on startup_32()'s fourth 
instruction).)


-boris
Luis Chamberlain Jan. 27, 2016, 12:04 a.m. UTC | #24
On Tue, Jan 26, 2016 at 04:51:38PM -0500, Boris Ostrovsky wrote:
> On 01/26/2016 03:30 PM, Luis R. Rodriguez wrote:

> hvmlite_start() is a 32-bit entry point [...]
>
> >4) hardware_subarch, hardware_subarch_data and future prospects
> >
> >Your patch relies on a *new* Linux entry point. Sure, we had one
> >for EFI, and sure there is another for Xen PV, but since you're just
> >rebranding PVH to HVMlite and given historic issues with any new
> >Linux entry points I'd like for us to take a breather and evaluate
> >the extent our lofty unification goals, and how the x86 boot protocol
> >could help with this already.
> 
> I am not sure I see how you can avoid having new entry point. For
> example, all HVMlite guests start in 32-bit mode. Who will switch to
> long mode?

x86 i386 entry points need to have code to do all that stuff, this can
happen for instance when you boot x86_64 from a 32-bit boot loader,
and I think other things as well are possible that trigger this as well.

> >why not just use a union and differentiate on PV subtype ? If you want to avoid
> >a lot of PV calls for HVMlite it seems you could just take advantage of
> >subarch Xen type, and differentiate on the subarch_data within Xen code
> >to make that code just PV sepecific.
> >
> >I only see gains by using the Xen subarch, so would like to know why PC is
> >being pushed.
> 
> It's not that subarch 0 is being pushed here. It's that I don't see
> how it can be useful for this particular guest type. Maybe as we add
> new features (or discover something that we've missed up till now)
> we can switch to using it. If you think we should delay initializing
> boot_params until then --- I will disagree: boot_params are used
> before we look at subarch and I don't believe it makes sense to pick
> and choose what to initialize before and what can wait.

subarch is part of boot_params, so not sure what I mean by an issue
in timing here. The question is if its set and then how early can
you possibly read the subarch from the boot_params.

> (And I am not sure it can be useful on PV neither, 

Well..

> at least the way it is used now. 

that is the issue.. If we get access to boot_params on early boot
then we can simply share the x86_64 entry point between PVH, PV,
and native x86_64 with what I'm proposing and some minor extensions.

> You will not reach the point in the (32-bit) code
> where it is tested. You will die way earlier (I think on
> startup_32()'s fourth instruction).)

Its a bit different requirement for the subarch for PV/PVH and
for HVMlite. Given what you have explained things are bit clearer
now and I see the issue.

You go:

hvmlite_start_xen() -->
	HVM stub
	startup_64() | (startup_32()

Note at the end of startup_32() though we have a neat asm
set of entries that depend on the subarch type. Perhaps we should
have a PV type early on startup_32() ? And/or I wonder if we can
work off of the EFI boot loader code. Half baked thoughts for now,
sorry have to go.

  Luis
Luis Chamberlain Jan. 27, 2016, 2:16 a.m. UTC | #25
On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote:
> You go:
>
> hvmlite_start_xen() -->
>         HVM stub
>         startup_64() | (startup_32()

Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to
rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not
be legacy or crap / old, so we'd need a way to catch it if we should
not use that code for this PV type. This begs the question, are you
also sure other callers in startup_32() or startup_64() might be OK as
well where previously guarded with pv_enabled() ?

 Luis
Luis Rodriguez Jan. 27, 2016, 4:54 a.m. UTC | #26
On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote:
>
> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com>
wrote:
> > You go:
> >
> > hvmlite_start_xen() -->
> >         HVM stub
> >         startup_64() | (startup_32()
>
> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to
> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not
> be legacy or crap / old, so we'd need a way to catch it if we should
> not use that code for this PV type. This begs the question, are you
> also sure other callers in startup_32() or startup_64() might be OK as
> well where previously guarded with pv_enabled() ?

Actually this call can't be used, and if early code used it prior to
setup_arch() it'd be a bug as its only properly set until later. Vetting
for correctness of all code call is still required though and perhaps we do
need something to catch now this PV type on early code such as this one if
we don't want it. From what I've gathered before on other bsp ucode we
don't want ucode loaded for PV guest types through these mechanisms.

  Luis
Konrad Rzeszutek Wilk Jan. 27, 2016, 2:42 p.m. UTC | #27
On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote:
> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote:
> >
> > On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com>
> wrote:
> > > You go:
> > >
> > > hvmlite_start_xen() -->
> > >         HVM stub
> > >         startup_64() | (startup_32()
> >
> > Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to
> > rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not
> > be legacy or crap / old, so we'd need a way to catch it if we should
> > not use that code for this PV type. This begs the question, are you
> > also sure other callers in startup_32() or startup_64() might be OK as
> > well where previously guarded with pv_enabled() ?
> 
> Actually this call can't be used, and if early code used it prior to
> setup_arch() it'd be a bug as its only properly set until later. Vetting
> for correctness of all code call is still required though and perhaps we do
> need something to catch now this PV type on early code such as this one if
> we don't want it. From what I've gathered before on other bsp ucode we
> don't want ucode loaded for PV guest types through these mechanisms.

It may help to not think of PVH/hvmlite as PV. It really is HVM with a lot
of emulated devices removed.

How does early microcode work on EFI? Does the EFI stub code have an early
microcode loading code ?

> 
>   Luis

> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
David Vrabel Jan. 27, 2016, 2:50 p.m. UTC | #28
On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote:
> On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote:
>> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote:
>>>
>>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com>
>> wrote:
>>>> You go:
>>>>
>>>> hvmlite_start_xen() -->
>>>>         HVM stub
>>>>         startup_64() | (startup_32()
>>>
>>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to
>>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not
>>> be legacy or crap / old, so we'd need a way to catch it if we should
>>> not use that code for this PV type. This begs the question, are you
>>> also sure other callers in startup_32() or startup_64() might be OK as
>>> well where previously guarded with pv_enabled() ?
>>
>> Actually this call can't be used, and if early code used it prior to
>> setup_arch() it'd be a bug as its only properly set until later. Vetting
>> for correctness of all code call is still required though and perhaps we do
>> need something to catch now this PV type on early code such as this one if
>> we don't want it. From what I've gathered before on other bsp ucode we
>> don't want ucode loaded for PV guest types through these mechanisms.
> 
> It may help to not think of PVH/hvmlite as PV. It really is HVM with a lot
> of emulated devices removed.
> 
> How does early microcode work on EFI? Does the EFI stub code have an early
> microcode loading code ?

Surely the interesting comparison here is how is (early) microcode
loading disabled in KVM guests?  We should use the same mechanism for
HVMlite guests.

David
Boris Ostrovsky Jan. 27, 2016, 3:06 p.m. UTC | #29
On 01/27/2016 09:50 AM, David Vrabel wrote:
> On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote:
>> On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote:
>>> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote:
>>>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com>
>>> wrote:
>>>>> You go:
>>>>>
>>>>> hvmlite_start_xen() -->
>>>>>          HVM stub
>>>>>          startup_64() | (startup_32()
>>>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to
>>>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not
>>>> be legacy or crap / old, so we'd need a way to catch it if we should
>>>> not use that code for this PV type. This begs the question, are you
>>>> also sure other callers in startup_32() or startup_64() might be OK as
>>>> well where previously guarded with pv_enabled() ?
>>> Actually this call can't be used, and if early code used it prior to
>>> setup_arch() it'd be a bug as its only properly set until later. Vetting
>>> for correctness of all code call is still required though and perhaps we do
>>> need something to catch now this PV type on early code such as this one if
>>> we don't want it. From what I've gathered before on other bsp ucode we
>>> don't want ucode loaded for PV guest types through these mechanisms.
>> It may help to not think of PVH/hvmlite as PV. It really is HVM with a lot
>> of emulated devices removed.
>>
>> How does early microcode work on EFI? Does the EFI stub code have an early
>> microcode loading code ?
> Surely the interesting comparison here is how is (early) microcode
> loading disabled in KVM guests?  We should use the same mechanism for
> HVMlite guests.


Why would we ever want to have a guest load microcode during boot? I can 
see how a (privileged) guest may want to load microcode from a shell 
(via microcode driver).

-boris
David Vrabel Jan. 27, 2016, 3:09 p.m. UTC | #30
On 27/01/16 15:06, Boris Ostrovsky wrote:
> On 01/27/2016 09:50 AM, David Vrabel wrote:
>> On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote:
>>> On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote:
>>>> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote:
>>>>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com>
>>>> wrote:
>>>>>> You go:
>>>>>>
>>>>>> hvmlite_start_xen() -->
>>>>>>          HVM stub
>>>>>>          startup_64() | (startup_32()
>>>>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to
>>>>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not
>>>>> be legacy or crap / old, so we'd need a way to catch it if we should
>>>>> not use that code for this PV type. This begs the question, are you
>>>>> also sure other callers in startup_32() or startup_64() might be OK as
>>>>> well where previously guarded with pv_enabled() ?
>>>> Actually this call can't be used, and if early code used it prior to
>>>> setup_arch() it'd be a bug as its only properly set until later.
>>>> Vetting
>>>> for correctness of all code call is still required though and
>>>> perhaps we do
>>>> need something to catch now this PV type on early code such as this
>>>> one if
>>>> we don't want it. From what I've gathered before on other bsp ucode we
>>>> don't want ucode loaded for PV guest types through these mechanisms.
>>> It may help to not think of PVH/hvmlite as PV. It really is HVM with
>>> a lot
>>> of emulated devices removed.
>>>
>>> How does early microcode work on EFI? Does the EFI stub code have an
>>> early
>>> microcode loading code ?
>> Surely the interesting comparison here is how is (early) microcode
>> loading disabled in KVM guests?  We should use the same mechanism for
           ^^^^^^^^
>> HVMlite guests.
> 
> 
> Why would we ever want to have a guest load microcode during boot? I can
> see how a (privileged) guest may want to load microcode from a shell
> (via microcode driver).

I think you missed a word when you read my reply.

David
Boris Ostrovsky Jan. 27, 2016, 3:17 p.m. UTC | #31
On 01/27/2016 10:09 AM, David Vrabel wrote:
> On 27/01/16 15:06, Boris Ostrovsky wrote:
>> On 01/27/2016 09:50 AM, David Vrabel wrote:
>>> On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote:
>>>> On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote:
>>>>> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote:
>>>>>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com>
>>>>> wrote:
>>>>>>> You go:
>>>>>>>
>>>>>>> hvmlite_start_xen() -->
>>>>>>>           HVM stub
>>>>>>>           startup_64() | (startup_32()
>>>>>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to
>>>>>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not
>>>>>> be legacy or crap / old, so we'd need a way to catch it if we should
>>>>>> not use that code for this PV type. This begs the question, are you
>>>>>> also sure other callers in startup_32() or startup_64() might be OK as
>>>>>> well where previously guarded with pv_enabled() ?
>>>>> Actually this call can't be used, and if early code used it prior to
>>>>> setup_arch() it'd be a bug as its only properly set until later.
>>>>> Vetting
>>>>> for correctness of all code call is still required though and
>>>>> perhaps we do
>>>>> need something to catch now this PV type on early code such as this
>>>>> one if
>>>>> we don't want it. From what I've gathered before on other bsp ucode we
>>>>> don't want ucode loaded for PV guest types through these mechanisms.
>>>> It may help to not think of PVH/hvmlite as PV. It really is HVM with
>>>> a lot
>>>> of emulated devices removed.
>>>>
>>>> How does early microcode work on EFI? Does the EFI stub code have an
>>>> early
>>>> microcode loading code ?
>>> Surely the interesting comparison here is how is (early) microcode
>>> loading disabled in KVM guests?  We should use the same mechanism for
>             ^^^^^^^^
>>> HVMlite guests.
>>
>> Why would we ever want to have a guest load microcode during boot? I can
>> see how a (privileged) guest may want to load microcode from a shell
>> (via microcode driver).
> I think you missed a word when you read my reply.

Yes, I missed it ;-)

Why not continue relying on paravirt_enabled()? We are going to keep 
this in some form for HVMlite.

-boris
Konrad Rzeszutek Wilk Jan. 27, 2016, 3:29 p.m. UTC | #32
On Wed, Jan 27, 2016 at 10:17:56AM -0500, Boris Ostrovsky wrote:
> On 01/27/2016 10:09 AM, David Vrabel wrote:
> >On 27/01/16 15:06, Boris Ostrovsky wrote:
> >>On 01/27/2016 09:50 AM, David Vrabel wrote:
> >>>On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote:
> >>>>On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote:
> >>>>>On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote:
> >>>>>>On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com>
> >>>>>wrote:
> >>>>>>>You go:
> >>>>>>>
> >>>>>>>hvmlite_start_xen() -->
> >>>>>>>          HVM stub
> >>>>>>>          startup_64() | (startup_32()
> >>>>>>Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to
> >>>>>>rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not
> >>>>>>be legacy or crap / old, so we'd need a way to catch it if we should
> >>>>>>not use that code for this PV type. This begs the question, are you
> >>>>>>also sure other callers in startup_32() or startup_64() might be OK as
> >>>>>>well where previously guarded with pv_enabled() ?
> >>>>>Actually this call can't be used, and if early code used it prior to
> >>>>>setup_arch() it'd be a bug as its only properly set until later.
> >>>>>Vetting
> >>>>>for correctness of all code call is still required though and
> >>>>>perhaps we do
> >>>>>need something to catch now this PV type on early code such as this
> >>>>>one if
> >>>>>we don't want it. From what I've gathered before on other bsp ucode we
> >>>>>don't want ucode loaded for PV guest types through these mechanisms.
> >>>>It may help to not think of PVH/hvmlite as PV. It really is HVM with
> >>>>a lot
> >>>>of emulated devices removed.
> >>>>
> >>>>How does early microcode work on EFI? Does the EFI stub code have an
> >>>>early
> >>>>microcode loading code ?
> >>>Surely the interesting comparison here is how is (early) microcode
> >>>loading disabled in KVM guests?  We should use the same mechanism for
> >            ^^^^^^^^
> >>>HVMlite guests.
> >>
> >>Why would we ever want to have a guest load microcode during boot? I can
> >>see how a (privileged) guest may want to load microcode from a shell
> >>(via microcode driver).
> >I think you missed a word when you read my reply.
> 
> Yes, I missed it ;-)
> 
> Why not continue relying on paravirt_enabled()? We are going to keep this in
> some form for HVMlite.

And this is where Luis comes in. He has posted an patchset which removes the
paravirt_enabled with .. Here is the link https://lkml.org/lkml/2015/12/15/772
Borislav Petkov Jan. 27, 2016, 4:14 p.m. UTC | #33
On Wed, Jan 27, 2016 at 02:50:36PM +0000, David Vrabel wrote:
> Surely the interesting comparison here is how is (early) microcode
> loading disabled in KVM guests?

It isn't - kvm simply ignores the write to the microcode application
MSRs MSR_AMD64_PATCH_LOADER and MSR_IA32_UCODE_REV, respectively.

> We should use the same mechanism for HVMlite guests.

Good idea.
Boris Ostrovsky Jan. 27, 2016, 4:15 p.m. UTC | #34
On 01/27/2016 10:29 AM, Konrad Rzeszutek Wilk wrote:
> On Wed, Jan 27, 2016 at 10:17:56AM -0500, Boris Ostrovsky wrote:
>> On 01/27/2016 10:09 AM, David Vrabel wrote:
>>> On 27/01/16 15:06, Boris Ostrovsky wrote:
>>>> On 01/27/2016 09:50 AM, David Vrabel wrote:
>>>>> On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote:
>>>>>> On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote:
>>>>>>> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote:
>>>>>>>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com>
>>>>>>> wrote:
>>>>>>>>> You go:
>>>>>>>>>
>>>>>>>>> hvmlite_start_xen() -->
>>>>>>>>>           HVM stub
>>>>>>>>>           startup_64() | (startup_32()
>>>>>>>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to
>>>>>>>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not
>>>>>>>> be legacy or crap / old, so we'd need a way to catch it if we should
>>>>>>>> not use that code for this PV type. This begs the question, are you
>>>>>>>> also sure other callers in startup_32() or startup_64() might be OK as
>>>>>>>> well where previously guarded with pv_enabled() ?
>>>>>>> Actually this call can't be used, and if early code used it prior to
>>>>>>> setup_arch() it'd be a bug as its only properly set until later.
>>>>>>> Vetting
>>>>>>> for correctness of all code call is still required though and
>>>>>>> perhaps we do
>>>>>>> need something to catch now this PV type on early code such as this
>>>>>>> one if
>>>>>>> we don't want it. From what I've gathered before on other bsp ucode we
>>>>>>> don't want ucode loaded for PV guest types through these mechanisms.
>>>>>> It may help to not think of PVH/hvmlite as PV. It really is HVM with
>>>>>> a lot
>>>>>> of emulated devices removed.
>>>>>>
>>>>>> How does early microcode work on EFI? Does the EFI stub code have an
>>>>>> early
>>>>>> microcode loading code ?
>>>>> Surely the interesting comparison here is how is (early) microcode
>>>>> loading disabled in KVM guests?  We should use the same mechanism for
>>>             ^^^^^^^^
>>>>> HVMlite guests.
>>>> Why would we ever want to have a guest load microcode during boot? I can
>>>> see how a (privileged) guest may want to load microcode from a shell
>>>> (via microcode driver).
>>> I think you missed a word when you read my reply.
>> Yes, I missed it ;-)
>>
>> Why not continue relying on paravirt_enabled()? We are going to keep this in
>> some form for HVMlite.
> And this is where Luis comes in. He has posted an patchset which removes the
> paravirt_enabled with .. Here is the link https://lkml.org/lkml/2015/12/15/772

Yes, I saw that and this will be renamed as paravirt_legacy() (which I 
am not sure is really what it should be called.)

Another option is to have early microcode code query CPUID to see 
whether we are running on a hypervisor (this in fact is what we 
originally thought of doing before realizing that we have 
paravirt_enabled()).

But then how is HVMlite different from a regular HVM guest trying to 
load microcode?

(BTW, to answer David's question about what KVM is doing --- it is 
ignoring writes to microcode MSRs, see kvm_set_msr_common().)

-boris
Luis Chamberlain Jan. 27, 2016, 6:48 p.m. UTC | #35
Bleh moving forward please use mcgrof@kernel.org, that will be sent to
my employer and my personal address. More below.

On Wed, Jan 27, 2016 at 8:15 AM, Boris Ostrovsky
<boris.ostrovsky@oracle.com> wrote:
> On 01/27/2016 10:29 AM, Konrad Rzeszutek Wilk wrote:
>> On Wed, Jan 27, 2016 at 10:17:56AM -0500, Boris Ostrovsky wrote:
>>> On 01/27/2016 10:09 AM, David Vrabel wrote:
>>>> On 27/01/16 15:06, Boris Ostrovsky wrote:
>>> Why not continue relying on paravirt_enabled()? We are going to keep this
>>> in
>>> some form for HVMlite.
>>
>> And this is where Luis comes in. He has posted an patchset which removes
>> the
>> paravirt_enabled with .. Here is the link
>> https://lkml.org/lkml/2015/12/15/772
>
>
> Yes, I saw that and this will be renamed as paravirt_legacy() (which I am
> not sure is really what it should be called.)

Given Konrad's pointers about some folks pushing for BIOS to have a
"legacy free option" where all legacy crap (PS/2, PnP, serial port,
parallel port) are all disabled [0], I'm inclined to respin the rename
patch to use x86_legacy_free(). This can later then be extended
provided such BIOS check becomes available.

[0] http://lkml.kernel.org/r/20160120193241.GA4769@char.us.oracle.com

But -- as I also pointed out to hpa recently, there is an issue with
using things like cpu_has_hypervisor() (or in this case
paravirt_enabled()) for early code, given that it relies on
init_hypervisor_platform() having been called first which is called on
setup_arch() [1]. So paravirt_enabled() really can only be used prior
to setup_arch() correctly (specifically after,
init_hypervisor_platform()), and anything else would be a bug. I
learned the hard way while doing some linker table work and trying to
find a good use / solution to the dead code concerns I've been aiming
to address due to Xen's separate entry point and the nature of pvops
[2] [3]. hpa's response to this issue was that we cannot and should
not abuse the boot_params hardware_subarch for this purpose (in this
case I was suggesting perhaps KVM could use it if it in the future
needed it for a kvm check earlier than setup_arch()), but he noted
that:

  "If you have a genuine need for a "hypervisor type" then that is a
separate thing and should be
   treated separately from subarch.  However, you need to consider
that some hypervisors can
  emulate other hypervisors and you may have more than one hypervisor
API available."

This goes along with my suggestion here earlier where I mentioned that
subarch is already used nicely to pivot off some specific subarch
entry after startup_32(), if we want to avoid yet-another-entry for
HVMlite (the PVH rebrand done cleanly) and still use startup_32() for
it we could perhaps follow similar strategy as with subarch but
instead add a hypervisor type and use that for a stub call the
hypervisor type if needed early on in startup_32(). This could perhaps
in turn also be used as a generic hypervisor type / and more robust /
easier / not-so-convoluted check. For instance of what I think is a
convoluted check refer to snd_intel8x0_inside_vm() in
sound/pci/intel8x0.c with its use of kvm_para_available(), #ifdefery
over boot_cpu_has(X86_FEATURE_HYPERVISOR) (which is just
cpu_has_hypervisor()). If we had a hypervisor type easily accessible
it could both be used by early init code and perhaps driver code to
replace these convoluted checks.

If this seems a bit sensible the next question to ask if -- how we'd
*set* the hypervisor type, do we extend the x86 boot protocol and
enable a hypervisor type on the zero page? That would clearly only
make sense if:

1) Its reasonable to x86 maintainers (perhaps pending this
discussion's outcome? If its preemptively known to not reasonable an
alternative suggestion would be appreciated)
2) Xen is willing to actually set this (and perhaps a new
hypervisor_type_data for the private data structure), but not much
more would be needed, and incorporate this as part of the DmLite
protocol, or whatever its called as an option for some OSes
3) The kernel could get access to this value from the zero page really
early on, immediately following startup_32()

Worth mentioning here also is hpa's clarification on when subarch type
PC (0) should be used: [it should be used if the hardware is]
"enumerable using standard PC mechanisms (PCI, ACPI) and doesn't need
a special boot flow" -- does that fit HVMLite's description so far? If
so then The Xen subarch may need to be redefined as well to be clear
what it means. I don't think we need to be precise but at the very
least cover grounds to enable the definitions to meet its actual use
to not confuse users.

[1] http://lkml.kernel.org/r/CAB=NE6WoKsP8+KGnJEtigWYktCMjg6iherCOcq-jskxi4P2QqA@mail.gmail.com
[2] http://www.do-not-panic.com/2015/12/avoiding-dead-code-pvops-not-silver-bullet.html
[3] http://www.do-not-panic.com/2015/12/xen-and-x86-linux-zero-page.html
[4] http://lkml.kernel.org/r/56A17B01.9040708@zytor.com

> Another option is to have early microcode code query CPUID to see whether we
> are running on a hypervisor (this in fact is what we originally thought of
> doing before realizing that we have paravirt_enabled()).

Please keep in mind we want to consider use and setting of something
generic, not just a solution for Xen, and if we want to use this to
help avoid a new entry point then strive for something we actually
could get access to very early on the first x86 entry point, both for
32-bit and 64-bit. If its accessible for both 32-bit and 64-bit well
hell, we could unify not only the entry points for HVMLite thing, but
I'm pretty sure also unify the entry points for PV and x86 as well (if
we wanted that) !

  Luis
Luis Chamberlain Jan. 27, 2016, 7 p.m. UTC | #36
On Wed, Jan 27, 2016 at 10:48 AM, Luis R. Rodriguez <mcgrof@kernel.org> wrote:
>
> Worth mentioning here also is hpa's clarification on when subarch type
> PC (0) should be used: [it should be used if the hardware is]
> "enumerable using standard PC mechanisms (PCI, ACPI) and doesn't need
> a special boot flow" -- does that fit HVMLite's description so far? If
> so then The Xen subarch may need to be redefined as well to be clear
> what it means. I don't think we need to be precise but at the very
> least cover grounds to enable the definitions to meet its actual use
> to not confuse users.

Another thing to consider for HVMlite is that if the 0 subarch (PC) is
used in light of my linker table work and x86's use of it with the
subarch and supported subarch bitmask, is that it would also mean
HVMLite would run all routines currently pegged as needing PC type
(the current KVM and bare metal path) and it would mean not running
anything only pegged with Xen subarch type (but note that today Xen
doesn't even set the subarch type). If there is nothing in common
between PV and HVMlite (no x86 init calls to share), and if HVMLite
*can* call *alllllllll* PC init calls, then by all means this is fine,
and if we just need to distinguish stuff between PC types that's fine,
it may still be possible to further extend hypervisor_type to the x86
init calls I'm adding as another supported_hyper_types to ensure even
though a subarch is being used, that we also check the supported
hypervisor type as well.

 Luis
Boris Ostrovsky Jan. 27, 2016, 11:58 p.m. UTC | #37
On 01/27/2016 02:00 PM, Luis R. Rodriguez wrote:
> On Wed, Jan 27, 2016 at 10:48 AM, Luis R. Rodriguez <mcgrof@kernel.org> wrote:
>> Worth mentioning here also is hpa's clarification on when subarch type
>> PC (0) should be used: [it should be used if the hardware is]
>> "enumerable using standard PC mechanisms (PCI, ACPI) and doesn't need
>> a special boot flow" -- does that fit HVMLite's description so far? If
>> so then The Xen subarch may need to be redefined as well to be clear
>> what it means. I don't think we need to be precise but at the very
>> least cover grounds to enable the definitions to meet its actual use
>> to not confuse users.
> Another thing to consider for HVMlite is that if the 0 subarch (PC) is
> used in light of my linker table work and x86's use of it with the
> subarch and supported subarch bitmask, is that it would also mean
> HVMLite would run all routines currently pegged as needing PC type
> (the current KVM and bare metal path) and it would mean not running
> anything only pegged with Xen subarch type (but note that today Xen
> doesn't even set the subarch type). If there is nothing in common
> between PV and HVMlite (no x86 init calls to share), and if HVMLite
> *can* call *alllllllll* PC init calls, then by all means this is fine,

Yes, that's the idea. HVMlite jumps to startup_32|64 from the stub and 
runs from there with subarch 0.

-boris

> and if we just need to distinguish stuff between PC types that's fine,
> it may still be possible to further extend hypervisor_type to the x86
> init calls I'm adding as another supported_hyper_types to ensure even
> though a subarch is being used, that we also check the supported
> hypervisor type as well.
>
>   Luis
diff mbox

Patch

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index e47e527..1d913d7 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -23,3 +23,4 @@  obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
 obj-$(CONFIG_XEN_DOM0)		+= vga.o
 obj-$(CONFIG_SWIOTLB_XEN)	+= pci-swiotlb-xen.o
 obj-$(CONFIG_XEN_EFI)		+= efi.o
+obj-$(CONFIG_XEN_PVHVM) 	+= xen-hvmlite.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2cf446a..2ed8b2b 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -118,7 +118,8 @@  DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
  */
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 
-enum xen_domain_type xen_domain_type = XEN_NATIVE;
+enum xen_domain_type xen_domain_type
+	__attribute__((section(".data"))) = XEN_NATIVE;
 EXPORT_SYMBOL_GPL(xen_domain_type);
 
 unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
@@ -171,6 +172,17 @@  struct tls_descs {
  */
 static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
 
+#ifdef CONFIG_XEN_PVHVM
+/*
+ * HVMlite variables. These need to live in data segment since they are
+ * initialized before startup_{32|64}, which clear .bss, are invoked.
+ */
+int xen_hvmlite __attribute__((section(".data"))) = 0;
+struct hvm_start_info hvmlite_start_info __attribute__((section(".data")));
+uint hvmlite_start_info_sz = sizeof(hvmlite_start_info);
+struct boot_params xen_hvmlite_boot_params __attribute__((section(".data")));
+#endif
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -1736,6 +1748,83 @@  asmlinkage __visible void __init xen_start_kernel(void)
 #endif
 }
 
+#ifdef CONFIG_XEN_PVHVM
+static void __init hvmlite_bootparams(void)
+{
+	struct xen_memory_map memmap;
+	int i;
+
+	memset(&xen_hvmlite_boot_params, 0, sizeof(xen_hvmlite_boot_params));
+
+	memmap.nr_entries = ARRAY_SIZE(xen_hvmlite_boot_params.e820_map);
+	set_xen_guest_handle(memmap.buffer, xen_hvmlite_boot_params.e820_map);
+	if (HYPERVISOR_memory_op(XENMEM_memory_map, &memmap)) {
+		xen_raw_console_write("XENMEM_memory_map failed\n");
+		BUG();
+	}
+
+	xen_hvmlite_boot_params.e820_map[memmap.nr_entries].addr =
+		ISA_START_ADDRESS;
+	xen_hvmlite_boot_params.e820_map[memmap.nr_entries].size =
+		ISA_END_ADDRESS - ISA_START_ADDRESS;
+	xen_hvmlite_boot_params.e820_map[memmap.nr_entries++].type =
+		E820_RESERVED;
+
+	sanitize_e820_map(xen_hvmlite_boot_params.e820_map,
+			  ARRAY_SIZE(xen_hvmlite_boot_params.e820_map),
+			  &memmap.nr_entries);
+
+	xen_hvmlite_boot_params.e820_entries = memmap.nr_entries;
+	for (i = 0; i < xen_hvmlite_boot_params.e820_entries; i++)
+		e820_add_region(xen_hvmlite_boot_params.e820_map[i].addr,
+				xen_hvmlite_boot_params.e820_map[i].size,
+				xen_hvmlite_boot_params.e820_map[i].type);
+
+	xen_hvmlite_boot_params.hdr.cmd_line_ptr =
+		hvmlite_start_info.cmdline_paddr;
+
+	/* The first module is always ramdisk */
+	if (hvmlite_start_info.nr_modules) {
+		struct hvm_modlist_entry *modaddr =
+			__va(hvmlite_start_info.modlist_paddr);
+		xen_hvmlite_boot_params.hdr.ramdisk_image = modaddr->paddr;
+		xen_hvmlite_boot_params.hdr.ramdisk_size = modaddr->size;
+	}
+
+	/*
+	 * See Documentation/x86/boot.txt.
+	 *
+	 * Version 2.12 supports Xen entry point but we will use default x86/PC
+	 * environment (i.e. hardware_subarch 0).
+	 */
+	xen_hvmlite_boot_params.hdr.version = 0x212;
+	xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */
+}
+
+/*
+ * This routine (and those that it might call) should not use
+ * anything that lives in .bss since that segment will be cleared later
+ */
+void __init xen_prepare_hvmlite(void)
+{
+	u32 eax, ecx, edx, msr;
+	u64 pfn;
+
+	cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx);
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	pv_info.name = "Xen HVMlite";
+	xen_domain_type = XEN_HVM_DOMAIN;
+	xen_hvmlite = 1;
+
+	x86_init.oem.arch_setup = xen_init_kernel;
+	x86_init.oem.banner = xen_banner;
+
+	hvmlite_bootparams();
+}
+#endif
+
 void __ref xen_hvm_init_shared_info(void)
 {
 	int cpu;
diff --git a/arch/x86/xen/xen-hvmlite.S b/arch/x86/xen/xen-hvmlite.S
new file mode 100644
index 0000000..90f03d0
--- /dev/null
+++ b/arch/x86/xen/xen-hvmlite.S
@@ -0,0 +1,158 @@ 
+/*
+ * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+	.code32
+	.text
+#define _pa(x)          ((x) - __START_KERNEL_map)
+
+#include <linux/elfnote.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/asm.h>
+#include <asm/boot.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <xen/interface/elfnote.h>
+
+	__HEAD
+	.code32
+
+/* Entry point for HVMlite guests */
+ENTRY(hvmlite_start_xen)
+	cli
+	cld
+
+	mov $_pa(gdt), %eax
+	lgdt (%eax)
+
+	movl $(__BOOT_DS),%eax
+	movl %eax,%ds
+	movl %eax,%es
+	movl %eax,%ss
+
+	/* Stash hvm_start_info */
+	mov $_pa(hvmlite_start_info), %edi
+	mov %ebx, %esi
+	mov $_pa(hvmlite_start_info_sz), %ecx
+	mov (%ecx), %ecx
+	rep
+	movsb
+
+	movl $_pa(early_stack_end), %eax
+	movl %eax, %esp
+
+	/* Enable PAE mode */
+	movl %cr4, %eax
+	orl $X86_CR4_PAE, %eax
+	movl %eax, %cr4
+
+#ifdef CONFIG_X86_64
+	/* Enable Long mode */
+	movl $MSR_EFER, %ecx
+	rdmsr
+	btsl $_EFER_LME, %eax
+	wrmsr
+
+	/* Enable pre-constructed page tables */
+	mov $_pa(init_level4_pgt), %eax
+	movl %eax, %cr3
+	movl $(X86_CR0_PG | X86_CR0_PE), %eax
+	movl %eax, %cr0
+
+	/* Jump to 64-bit mode. */
+	pushl $__KERNEL_CS
+	leal _pa(1f), %eax
+	pushl %eax
+	lret
+
+	/* 64-bit entry point */
+	.code64
+1:
+	call xen_prepare_hvmlite
+
+	/* startup_64 expects boot_params in %rsi */
+	mov $_pa(xen_hvmlite_boot_params), %rsi
+	movq $_pa(startup_64), %rax
+	jmp *%rax
+
+#else /* CONFIG_X86_64 */
+
+	/* Use initial_page table and set level 2 to map 2M pages */
+	movl $_pa(initial_pg_pmd), %edi
+	movl $(_PAGE_PSE | _PAGE_RW | _PAGE_PRESENT), %eax
+	movl $2048, %ecx
+2:
+	movl %eax, 0(%edi)
+	addl $0x00200000, %eax
+	addl $8, %edi
+	decl %ecx
+	jnz 2b
+
+	/* Enable the boot paging */
+	movl $_pa(initial_page_table), %eax
+	movl %eax, %cr3
+	movl %cr0, %eax
+	orl $(X86_CR0_PG | X86_CR0_PE), %eax
+	movl %eax, %cr0
+
+	ljmp $__BOOT_CS,$3f
+3:
+	call xen_prepare_hvmlite
+	mov $_pa(xen_hvmlite_boot_params), %esi
+
+	/* startup_32 doesn't expect paging and PAE to be on */
+	ljmp $__BOOT_CS,$_pa(4f)
+4:
+	movl %cr0, %eax
+	andl $~X86_CR0_PG, %eax
+	movl %eax, %cr0
+	movl %cr4, %eax
+	andl $~X86_CR4_PAE, %eax
+	movl %eax, %cr4
+
+	/* Restore initial_pg_pmd to its original (zero) state */
+	movl $_pa(initial_pg_pmd), %edi
+	xorl %eax, %eax
+	movl $(PAGE_SIZE/4), %ecx
+	rep stosl
+
+	ljmp    $0x10, $_pa(startup_32)
+#endif
+
+	.data
+gdt:
+	.word	gdt_end - gdt
+	.long	_pa(gdt)
+	.word	0
+	.quad	0x0000000000000000 /* NULL descriptor */
+#ifdef CONFIG_X86_64
+	.quad	0x00af9a000000ffff /* __KERNEL_CS */
+#else
+	.quad	0x00cf9a000000ffff /* __KERNEL_CS */
+#endif
+	.quad	0x00cf92000000ffff /* __KERNEL_DS */
+gdt_end:
+
+	.bss
+	.balign 4
+early_stack:
+	.fill 16, 1, 0
+early_stack_end:
+
+	ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
+	             _ASM_PTR (hvmlite_start_xen - __START_KERNEL_map))
diff --git a/include/xen/xen.h b/include/xen/xen.h
index 0c0e3ef..6a0d3f3 100644
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -29,6 +29,12 @@  extern enum xen_domain_type xen_domain_type;
 #define xen_initial_domain()	(0)
 #endif	/* CONFIG_XEN_DOM0 */
 
+#ifdef CONFIG_XEN_PVHVM
+extern int xen_hvmlite;
+#else
+#define xen_hvmlite		(0)
+#endif
+
 #ifdef CONFIG_XEN_PVH
 /* This functionality exists only for x86. The XEN_PVHVM support exists
  * only in x86 world - hence on ARM it will be always disabled.