Message ID | 1453498558-6028-5-git-send-email-boris.ostrovsky@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote: > Start HVMlite guest XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall > page, initialize boot_params, enable early page tables. > > Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> > --- > arch/x86/xen/Makefile | 1 + > arch/x86/xen/enlighten.c | 91 +++++++++++++++++++++++++- > arch/x86/xen/xen-hvmlite.S | 158 ++++++++++++++++++++++++++++++++++++++++++++ > include/xen/xen.h | 6 ++ > 4 files changed, 255 insertions(+), 1 deletions(-) > create mode 100644 arch/x86/xen/xen-hvmlite.S > > diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile > index e47e527..1d913d7 100644 > --- a/arch/x86/xen/Makefile > +++ b/arch/x86/xen/Makefile > @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o > obj-$(CONFIG_XEN_DOM0) += vga.o > obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o > obj-$(CONFIG_XEN_EFI) += efi.o > +obj-$(CONFIG_XEN_PVHVM) += xen-hvmlite.o > diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c > index 2cf446a..2ed8b2b 100644 > --- a/arch/x86/xen/enlighten.c > +++ b/arch/x86/xen/enlighten.c > @@ -118,7 +118,8 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); > */ > DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); > > -enum xen_domain_type xen_domain_type = XEN_NATIVE; > +enum xen_domain_type xen_domain_type > + __attribute__((section(".data"))) = XEN_NATIVE; > EXPORT_SYMBOL_GPL(xen_domain_type); But why? This is not explained. > > unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; > @@ -171,6 +172,17 @@ struct tls_descs { > */ > static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); > > +#ifdef CONFIG_XEN_PVHVM > +/* > + * HVMlite variables. These need to live in data segment since they are > + * initialized before startup_{32|64}, which clear .bss, are invoked. > + */ I guess for this reason.. Perhaps a bit more clear: /* * HVMlite variables. These need to live in data segment since they are * before before startup_{32|64} is invoked, otherwise they would be cleared. */ > +int xen_hvmlite __attribute__((section(".data"))) = 0; > +struct hvm_start_info hvmlite_start_info __attribute__((section(".data"))); > +uint hvmlite_start_info_sz = sizeof(hvmlite_start_info); > +struct boot_params xen_hvmlite_boot_params __attribute__((section(".data"))); > +#endif > + > static void clamp_max_cpus(void) > { > #ifdef CONFIG_SMP > @@ -1736,6 +1748,83 @@ asmlinkage __visible void __init xen_start_kernel(void) > #endif > } > > +#ifdef CONFIG_XEN_PVHVM > +static void __init hvmlite_bootparams(void) > +{ Hrm. > + struct xen_memory_map memmap; > + int i; > + > + memset(&xen_hvmlite_boot_params, 0, sizeof(xen_hvmlite_boot_params)); > + > + memmap.nr_entries = ARRAY_SIZE(xen_hvmlite_boot_params.e820_map); > + set_xen_guest_handle(memmap.buffer, xen_hvmlite_boot_params.e820_map); > + if (HYPERVISOR_memory_op(XENMEM_memory_map, &memmap)) { > + xen_raw_console_write("XENMEM_memory_map failed\n"); > + BUG(); > + } > + > + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].addr = > + ISA_START_ADDRESS; > + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].size = > + ISA_END_ADDRESS - ISA_START_ADDRESS; > + xen_hvmlite_boot_params.e820_map[memmap.nr_entries++].type = > + E820_RESERVED; > + > + sanitize_e820_map(xen_hvmlite_boot_params.e820_map, > + ARRAY_SIZE(xen_hvmlite_boot_params.e820_map), > + &memmap.nr_entries); > + > + xen_hvmlite_boot_params.e820_entries = memmap.nr_entries; > + for (i = 0; i < xen_hvmlite_boot_params.e820_entries; i++) > + e820_add_region(xen_hvmlite_boot_params.e820_map[i].addr, > + xen_hvmlite_boot_params.e820_map[i].size, > + xen_hvmlite_boot_params.e820_map[i].type); > + > + xen_hvmlite_boot_params.hdr.cmd_line_ptr = > + hvmlite_start_info.cmdline_paddr; > + > + /* The first module is always ramdisk */ > + if (hvmlite_start_info.nr_modules) { > + struct hvm_modlist_entry *modaddr = > + __va(hvmlite_start_info.modlist_paddr); > + xen_hvmlite_boot_params.hdr.ramdisk_image = modaddr->paddr; > + xen_hvmlite_boot_params.hdr.ramdisk_size = modaddr->size; > + } > + > + /* > + * See Documentation/x86/boot.txt. > + * > + * Version 2.12 supports Xen entry point but we will use default x86/PC > + * environment (i.e. hardware_subarch 0). > + */ > + xen_hvmlite_boot_params.hdr.version = 0x212; > + xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */ > +} I realize PV got away with setting up boot_params on C code but best ask now that this new code is being introduced: why can't we just have the Xen hypervisor fill this in? It'd save us all this code. On this page I show the difference, as an example of what this would look like in contrast to how lguest set things up as an example in a very clean way: http://www.do-not-panic.com/2015/12/xen-and-x86-linux-zero-page.html > +/* > + * This routine (and those that it might call) should not use > + * anything that lives in .bss since that segment will be cleared later > + */ > +void __init xen_prepare_hvmlite(void) > +{ > + u32 eax, ecx, edx, msr; > + u64 pfn; > + > + cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx); > + pfn = __pa(hypercall_page); > + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); > + > + pv_info.name = "Xen HVMlite"; > + xen_domain_type = XEN_HVM_DOMAIN; > + xen_hvmlite = 1; > + > + x86_init.oem.arch_setup = xen_init_kernel; > + x86_init.oem.banner = xen_banner; > + > + hvmlite_bootparams(); > +} > +#endif If the boot_params.hdr.hardware_subarch_data pointed to a custom struct then the first C entry point for Xen could shuffle this and set this too, by still using less asm entry helpers. We'd still need this run but with the linker table I think we could have a stub small stub for hvm run, it would not be a call from asm. > + > void __ref xen_hvm_init_shared_info(void) > { > int cpu; > diff --git a/arch/x86/xen/xen-hvmlite.S b/arch/x86/xen/xen-hvmlite.S > new file mode 100644 > index 0000000..90f03d0 > --- /dev/null > +++ b/arch/x86/xen/xen-hvmlite.S > @@ -0,0 +1,158 @@ > +/* > + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with this program. If not, see <http://www.gnu.org/licenses/>. > + */ > + > + .code32 > + .text > +#define _pa(x) ((x) - __START_KERNEL_map) > + > +#include <linux/elfnote.h> > +#include <linux/init.h> > +#include <linux/linkage.h> > +#include <asm/segment.h> > +#include <asm/asm.h> > +#include <asm/boot.h> > +#include <asm/processor-flags.h> > +#include <asm/msr.h> > +#include <xen/interface/elfnote.h> > + > + __HEAD > + .code32 > + > +/* Entry point for HVMlite guests */ > +ENTRY(hvmlite_start_xen) Yet another asm entry point for Linux, could it perhaps someway be possible to just share some of this asm entry code in a clean way ? > + cli > + cld > + > + mov $_pa(gdt), %eax > + lgdt (%eax) > + > + movl $(__BOOT_DS),%eax > + movl %eax,%ds > + movl %eax,%es > + movl %eax,%ss > + > + /* Stash hvm_start_info */ > + mov $_pa(hvmlite_start_info), %edi > + mov %ebx, %esi > + mov $_pa(hvmlite_start_info_sz), %ecx > + mov (%ecx), %ecx > + rep > + movsb > + > + movl $_pa(early_stack_end), %eax > + movl %eax, %esp > + > + /* Enable PAE mode */ > + movl %cr4, %eax > + orl $X86_CR4_PAE, %eax > + movl %eax, %cr4 > + > +#ifdef CONFIG_X86_64 > + /* Enable Long mode */ > + movl $MSR_EFER, %ecx > + rdmsr > + btsl $_EFER_LME, %eax > + wrmsr > + > + /* Enable pre-constructed page tables */ > + mov $_pa(init_level4_pgt), %eax > + movl %eax, %cr3 > + movl $(X86_CR0_PG | X86_CR0_PE), %eax > + movl %eax, %cr0 > + > + /* Jump to 64-bit mode. */ > + pushl $__KERNEL_CS > + leal _pa(1f), %eax > + pushl %eax > + lret > + > + /* 64-bit entry point */ > + .code64 > +1: > + call xen_prepare_hvmlite > + > + /* startup_64 expects boot_params in %rsi */ > + mov $_pa(xen_hvmlite_boot_params), %rsi > + movq $_pa(startup_64), %rax Nice! But again why can't the Xen hypervisor just set the boot_params? All other Linux loaders do it. Why is Xen special? Luis > + jmp *%rax > + > +#else /* CONFIG_X86_64 */ > + > + /* Use initial_page table and set level 2 to map 2M pages */ > + movl $_pa(initial_pg_pmd), %edi > + movl $(_PAGE_PSE | _PAGE_RW | _PAGE_PRESENT), %eax > + movl $2048, %ecx > +2: > + movl %eax, 0(%edi) > + addl $0x00200000, %eax > + addl $8, %edi > + decl %ecx > + jnz 2b > + > + /* Enable the boot paging */ > + movl $_pa(initial_page_table), %eax > + movl %eax, %cr3 > + movl %cr0, %eax > + orl $(X86_CR0_PG | X86_CR0_PE), %eax > + movl %eax, %cr0 > + > + ljmp $__BOOT_CS,$3f > +3: > + call xen_prepare_hvmlite > + mov $_pa(xen_hvmlite_boot_params), %esi > + > + /* startup_32 doesn't expect paging and PAE to be on */ > + ljmp $__BOOT_CS,$_pa(4f) > +4: > + movl %cr0, %eax > + andl $~X86_CR0_PG, %eax > + movl %eax, %cr0 > + movl %cr4, %eax > + andl $~X86_CR4_PAE, %eax > + movl %eax, %cr4 > + > + /* Restore initial_pg_pmd to its original (zero) state */ > + movl $_pa(initial_pg_pmd), %edi > + xorl %eax, %eax > + movl $(PAGE_SIZE/4), %ecx > + rep stosl > + > + ljmp $0x10, $_pa(startup_32) > +#endif > + > + .data > +gdt: > + .word gdt_end - gdt > + .long _pa(gdt) > + .word 0 > + .quad 0x0000000000000000 /* NULL descriptor */ > +#ifdef CONFIG_X86_64 > + .quad 0x00af9a000000ffff /* __KERNEL_CS */ > +#else > + .quad 0x00cf9a000000ffff /* __KERNEL_CS */ > +#endif > + .quad 0x00cf92000000ffff /* __KERNEL_DS */ > +gdt_end: > + > + .bss > + .balign 4 > +early_stack: > + .fill 16, 1, 0 > +early_stack_end: > + > + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, > + _ASM_PTR (hvmlite_start_xen - __START_KERNEL_map)) > diff --git a/include/xen/xen.h b/include/xen/xen.h > index 0c0e3ef..6a0d3f3 100644 > --- a/include/xen/xen.h > +++ b/include/xen/xen.h > @@ -29,6 +29,12 @@ extern enum xen_domain_type xen_domain_type; > #define xen_initial_domain() (0) > #endif /* CONFIG_XEN_DOM0 */ > > +#ifdef CONFIG_XEN_PVHVM > +extern int xen_hvmlite; > +#else > +#define xen_hvmlite (0) > +#endif > + > #ifdef CONFIG_XEN_PVH > /* This functionality exists only for x86. The XEN_PVHVM support exists > * only in x86 world - hence on ARM it will be always disabled. > -- > 1.7.1 > >
On 22/01/2016 23:32, Luis R. Rodriguez wrote: > On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote: >> + /* >> + * See Documentation/x86/boot.txt. >> + * >> + * Version 2.12 supports Xen entry point but we will use default x86/PC >> + * environment (i.e. hardware_subarch 0). >> + */ >> + xen_hvmlite_boot_params.hdr.version = 0x212; >> + xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */ >> +} > I realize PV got away with setting up boot_params on C code but best > ask now that this new code is being introduced: why can't we just have > the Xen hypervisor fill this in? It'd save us all this code. I agree that this looks to be a mess. Having said that, the DMLite boot protocol is OS agnostic, and will be staying that way. It happens to look suspiciously like multiboot; a flat 32bit protected mode entry (at a location chosen in an ELF note), with %ebx pointing to an in-ram structure containing things like a command line and module list. I would have though the correct way to do direct Linux support would be to have a very small init stub which constructs an appropriate zero page, and lets the native entry point get on with things. This covers the usecase where people wish to boot a specific Linux kernel straight out of the dom0 filesystem. For the alternative usecase of general OS support, dom0 would boot something such as grub2 as the DMLite "kernel", at which point all stooging around in the guests filesystem is done from guest context, rather than control context (mitigating a substantial attack surface). ~Andrew
On Fri, Jan 22, 2016 at 4:30 PM, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > I would have though the correct way to do direct Linux support would be > to have a very small init stub which constructs an appropriate zero > page, and lets the native entry point get on with things. As hpa noted recently in another thread [0] that is precisely what hardware_subarch and hardware_subarch_data was meant to be used for, and its what I'm alluding to. The only thing though is that as far as we're concerned on x86 we had expected use of hardware_subarch and hardware_subarch_data only for PV, and not for HVM. This seems to be HVM related, but I think this is just a rebranding of PVH to HVMLite, right, so I think the use case of hardware_subarch and hardware_subarch_data are still welcomed as expected in the original design. [0] http://lkml.kernel.org/r/56A130B5.8060701@zytor.com Luis
On Fri, Jan 22, 2016 at 4:30 PM, Andrew Cooper <andrew.cooper3@citrix.com> wrote: > the DMLite boot > protocol is OS agnostic, and will be staying that way. What's the DMLite boot protocol? Is that the protocol that is defined by Xen to boot Xen guests and dom0? Is this well documented somewhere? To be clear are you saying that by no means will Xen change to instead of setting a, say zero-page, it would just want to always stuff a xen struct, pass that to the boot entry, and then expect always the guest kernel to always parse this? If true, then by no means, no matter how hard we try, and no matter what we do on the Linux front to help clean things up will we be able to have a unified bare metal / Xen entry. I'm noting it could be possible though provided we do just set the zero page, the subarch to Xen and subarch_data to the Xen custom data structure. Luis
On 23/01/2016 00:55, Luis R. Rodriguez wrote: > On Fri, Jan 22, 2016 at 4:30 PM, Andrew Cooper > <andrew.cooper3@citrix.com> wrote: >> the DMLite boot >> protocol is OS agnostic, and will be staying that way. > What's the DMLite boot protocol? It is a statement of the initial state of a DMLite container. > Is that the protocol that is defined by Xen to boot Xen guests and dom0? Technically it is toolstack which constructs this initial state, but broadly yes. > Is this well documented somewhere? There is a patch out on the list formalising the ABI in writing. (Roger: ping?) > To be clear are you saying that by no means will Xen change to instead > of setting a, say zero-page, it would just want to always stuff a xen > struct, pass that to the boot entry, and then expect always the guest > kernel to always parse this? Correct. Why do you think we should lumber non-Linux guests with a Linux-specific boot protocol? Quite apart from the fact that Linux is second to the table here (FreeBSD was first), it causes inappropriate linkage between the toolstack and the version of Linux wishing to be booted. > > If true, then by no means, no matter how hard we try, and no matter > what we do on the Linux front to help clean things up will we be able > to have a unified bare metal / Xen entry. I'm noting it could be > possible though provided we do just set the zero page, the subarch to > Xen and subarch_data to the Xen custom data structure. All you need is a very small stub which starts per the DMLite ABI, sets up an appropriate zero_page, and jumps to the native entrypoint. However, this stub belongs in Linux, not in the Xen toolstack. That way, when the Linux boot protocol is modified, both sides can be updated accordingly. ~Andrew
>However, this stub belongs in Linux, not in the Xen toolstack. That >way, when the Linux boot protocol is modified, both sides can be >updated >accordingly. I would add that this idea is borrowed from the EFI stub code that Linux has which also constructs the boot parameter structure when invoked (either from firmware or from EFI shell).
On January 23, 2016 7:34:33 AM PST, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: > >>However, this stub belongs in Linux, not in the Xen toolstack. That >>way, when the Linux boot protocol is modified, both sides can be >>updated >>accordingly. > >I would add that this idea is borrowed from the EFI stub code that >Linux has which also constructs the boot parameter structure when >invoked (either from firmware or from EFI shell). There is a huge difference though: EFI is a widely used multivendor industry standard. You are taking about something Xen-specific, and which in good Xen tradition isn't even documented, apparently (did we ever get documentation for the hypervisor ABI?) Asking "why burden Xen with something Linux-specific" is a pretty extreme case of the tail wagging the dog. That being said, before any code can be put anywhere, it needs to be written. We can argue where to put it later. We went through this process with the EFI stub, too: a standalone implementation (efilinux) first.
On January 23, 2016 11:01:06 AM EST, "H. Peter Anvin" <hpa@zytor.com> wrote: >On January 23, 2016 7:34:33 AM PST, Konrad Rzeszutek Wilk ><konrad.wilk@oracle.com> wrote: >> >>>However, this stub belongs in Linux, not in the Xen toolstack. That >>>way, when the Linux boot protocol is modified, both sides can be >>>updated >>>accordingly. >> >>I would add that this idea is borrowed from the EFI stub code that >>Linux has which also constructs the boot parameter structure when >>invoked (either from firmware or from EFI shell). > >There is a huge difference though: EFI is a widely used multivendor >industry standard. You are taking about something Xen-specific, and >which in good Xen tradition isn't even documented, apparently (did we >ever get documentation for the hypervisor ABI?) > >Asking "why burden Xen with something Linux-specific" is a pretty >extreme case of the tail wagging the dog. > >That being said, before any code can be put anywhere, it needs to be >written. We can argue where to put it later. We went through this >process with the EFI stub, too: a standalone implementation (efilinux) >first. http://lists.xenproject.org/archives/html/xen-devel/2015-12/msg01793.html I believe is the latest version. Roger (CCed) has probably an updated one.
On January 23, 2016 8:12:23 AM PST, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote: >On January 23, 2016 11:01:06 AM EST, "H. Peter Anvin" <hpa@zytor.com> >wrote: >>On January 23, 2016 7:34:33 AM PST, Konrad Rzeszutek Wilk >><konrad.wilk@oracle.com> wrote: >>> >>>>However, this stub belongs in Linux, not in the Xen toolstack. That >>>>way, when the Linux boot protocol is modified, both sides can be >>>>updated >>>>accordingly. >>> >>>I would add that this idea is borrowed from the EFI stub code that >>>Linux has which also constructs the boot parameter structure when >>>invoked (either from firmware or from EFI shell). >> >>There is a huge difference though: EFI is a widely used multivendor >>industry standard. You are taking about something Xen-specific, and >>which in good Xen tradition isn't even documented, apparently (did we >>ever get documentation for the hypervisor ABI?) >> >>Asking "why burden Xen with something Linux-specific" is a pretty >>extreme case of the tail wagging the dog. >> >>That being said, before any code can be put anywhere, it needs to be >>written. We can argue where to put it later. We went through this >>process with the EFI stub, too: a standalone implementation (efilinux) >>first. > >http://lists.xenproject.org/archives/html/xen-devel/2015-12/msg01793.html > >I believe is the latest version. Roger (CCed) has probably an updated >one. I suspect you should write a noninteractive bootloader as a reference implementation, and then consider porting Grub2 and maybe Syslinux to your ABI for those that want a full featured interactive bootloader compatible with the normal management.
El 23/01/16 a les 17.12, Konrad Rzeszutek Wilk ha escrit: > On January 23, 2016 11:01:06 AM EST, "H. Peter Anvin" <hpa@zytor.com> wrote: >> On January 23, 2016 7:34:33 AM PST, Konrad Rzeszutek Wilk >> <konrad.wilk@oracle.com> wrote: >>> >>>> However, this stub belongs in Linux, not in the Xen toolstack. That >>>> way, when the Linux boot protocol is modified, both sides can be >>>> updated >>>> accordingly. >>> >>> I would add that this idea is borrowed from the EFI stub code that >>> Linux has which also constructs the boot parameter structure when >>> invoked (either from firmware or from EFI shell). >> >> There is a huge difference though: EFI is a widely used multivendor >> industry standard. You are taking about something Xen-specific, and >> which in good Xen tradition isn't even documented, apparently (did we >> ever get documentation for the hypervisor ABI?) >> >> Asking "why burden Xen with something Linux-specific" is a pretty >> extreme case of the tail wagging the dog. >> >> That being said, before any code can be put anywhere, it needs to be >> written. We can argue where to put it later. We went through this >> process with the EFI stub, too: a standalone implementation (efilinux) >> first. > > http://lists.xenproject.org/archives/html/xen-devel/2015-12/msg01793.html > > I believe is the latest version. Roger (CCed) has probably an updated one. Yes, the technical side has not changed at all. I've just send an updated version with some wording changes: http://lists.xenproject.org/archives/html/xen-devel/2016-01/msg03029.html Roger.
On 01/22/2016 07:30 PM, Andrew Cooper wrote: > On 22/01/2016 23:32, Luis R. Rodriguez wrote: >> On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote: >>> + /* >>> + * See Documentation/x86/boot.txt. >>> + * >>> + * Version 2.12 supports Xen entry point but we will use default x86/PC >>> + * environment (i.e. hardware_subarch 0). >>> + */ >>> + xen_hvmlite_boot_params.hdr.version = 0x212; >>> + xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */ >>> +} >> I realize PV got away with setting up boot_params on C code but best >> ask now that this new code is being introduced: why can't we just have >> the Xen hypervisor fill this in? It'd save us all this code. > I agree that this looks to be a mess. Having said that, the DMLite boot > protocol is OS agnostic, and will be staying that way. > > It happens to look suspiciously like multiboot; a flat 32bit protected > mode entry (at a location chosen in an ELF note), with %ebx pointing to > an in-ram structure containing things like a command line and module list. > > I would have though the correct way to do direct Linux support would be > to have a very small init stub which constructs an appropriate zero > page, and lets the native entry point get on with things. Which is really what hvmlite_start_xen()->xen_prepare_hvmlite()->hvmlite_bootparams() is doing. Not much more than that (for 64-bit it also loads identity mapping because that's what startup_64 wants) -boris > > This covers the usecase where people wish to boot a specific Linux > kernel straight out of the dom0 filesystem. > > For the alternative usecase of general OS support, dom0 would boot > something such as grub2 as the DMLite "kernel", at which point all > stooging around in the guests filesystem is done from guest context, > rather than control context (mitigating a substantial attack surface). > > ~Andrew
On 01/22/2016 06:32 PM, Luis R. Rodriguez wrote: > On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote: > >> +/* >> + * This routine (and those that it might call) should not use >> + * anything that lives in .bss since that segment will be cleared later >> + */ >> +void __init xen_prepare_hvmlite(void) >> +{ >> + u32 eax, ecx, edx, msr; >> + u64 pfn; >> + >> + cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx); >> + pfn = __pa(hypercall_page); >> + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); >> + >> + pv_info.name = "Xen HVMlite"; >> + xen_domain_type = XEN_HVM_DOMAIN; >> + xen_hvmlite = 1; >> + >> + x86_init.oem.arch_setup = xen_init_kernel; >> + x86_init.oem.banner = xen_banner; >> + >> + hvmlite_bootparams(); >> +} >> +#endif > If the boot_params.hdr.hardware_subarch_data pointed to a custom > struct then the first C entry point for Xen could shuffle this and > set this too, by still using less asm entry helpers. We'd still > need this run but with the linker table I think we could have > a stub small stub for hvm run, it would not be a call from asm. Perhaps, but someone would still have to set hardware_subarch. And it's hvmlite_bootparams() that does it. And that's not sufficient, I think. There are still some things that trampoline code sets up (e.g. page tables for 64-bit). -boris
On Mon, Jan 25, 2016 at 11:08:47AM -0500, Boris Ostrovsky wrote: > On 01/22/2016 06:32 PM, Luis R. Rodriguez wrote: > >On Fri, Jan 22, 2016 at 04:35:50PM -0500, Boris Ostrovsky wrote: > > > >>+/* > >>+ * This routine (and those that it might call) should not use > >>+ * anything that lives in .bss since that segment will be cleared later > >>+ */ > >>+void __init xen_prepare_hvmlite(void) > >>+{ > >>+ u32 eax, ecx, edx, msr; > >>+ u64 pfn; > >>+ > >>+ cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx); > >>+ pfn = __pa(hypercall_page); > >>+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); > >>+ > >>+ pv_info.name = "Xen HVMlite"; > >>+ xen_domain_type = XEN_HVM_DOMAIN; > >>+ xen_hvmlite = 1; > >>+ > >>+ x86_init.oem.arch_setup = xen_init_kernel; > >>+ x86_init.oem.banner = xen_banner; > >>+ > >>+ hvmlite_bootparams(); > >>+} > >>+#endif > >If the boot_params.hdr.hardware_subarch_data pointed to a custom > >struct then the first C entry point for Xen could shuffle this and > >set this too, by still using less asm entry helpers. We'd still > >need this run but with the linker table I think we could have > >a stub small stub for hvm run, it would not be a call from asm. > > Perhaps, but someone would still have to set hardware_subarch. And > it's hvmlite_bootparams() that does it. No, Xen would do it as well, essentially all of hvmlite_bootparams() could be done in Xen. > And that's not sufficient, I think. There are still some things that > trampoline code sets up (e.g. page tables for 64-bit). Sure, but only what is required, that should be rather smaller. Luis
On 01/25/16 13:12, Luis R. Rodriguez wrote: >> >> Perhaps, but someone would still have to set hardware_subarch. And >> it's hvmlite_bootparams() that does it. > > No, Xen would do it as well, essentially all of hvmlite_bootparams() could be > done in Xen. > Or a stub code. -hpa
On Sat, Jan 23, 2016 at 02:49:36PM +0000, Andrew Cooper wrote: > On 23/01/2016 00:55, Luis R. Rodriguez wrote: > > On Fri, Jan 22, 2016 at 4:30 PM, Andrew Cooper > > <andrew.cooper3@citrix.com> wrote: > >> the DMLite boot > >> protocol is OS agnostic, and will be staying that way. > > What's the DMLite boot protocol? > > It is a statement of the initial state of a DMLite container. I see. > > Is that the protocol that is defined by Xen to boot Xen guests and dom0? > > Technically it is toolstack which constructs this initial state, but > broadly yes. I see thanks. > > Is this well documented somewhere? > > There is a patch out on the list formalising the ABI in writing. (Roger: > ping?) Thanks for this. > > To be clear are you saying that by no means will Xen change to instead > > of setting a, say zero-page, it would just want to always stuff a xen > > struct, pass that to the boot entry, and then expect always the guest > > kernel to always parse this? > > Correct. Why do you think we should lumber non-Linux guests with a > Linux-specific boot protocol? You should by no means do that, agreed, however one should have the flexibility to support such types of OSes that use page specific settings for the OS. The way I see it this is an OS feature and obviously it can vary. > Quite apart from the fact that Linux is second to the table here > (FreeBSD was first), This means little but just history. > it causes inappropriate linkage between the > toolstack and the version of Linux wishing to be booted. There are ways to do it in such a way that it does not create dependency issues on Linux specific code. 0) The Xen way and EFI way A generic data structure is passed to the entry point on the kernel to load the kernel. The kernel in turn must parse this generic specific struct and interprets it and translate it to the kernel specific values. 1) The qemu way: One way is to simply not refer to the boot_params data structures but IMHO that produces really ugly code. The qemu folks did it this way, so qemu does not use arch/x86/include/uapi/asm/bootparam.h in any way, instead it uses offsets from a char *header. Refer to load_linux() in hw/i386/pc.c, for instance: header[0x211] |= 0x80; /* CAN_USE_HEAP */ 2) The grub way: Another way, which grub uses, is to define their own data structures based on arch/x86/include/uapi/asm/bootparam.h, with their own data types, and refer to them in code, for instance refer to grub_cmd_linux() on the grub file grub-core/loader/i386/pc/linux.c and its copy definition of the header definition in include/grub/i386/linux.h. lh.loadflags |= GRUB_LINUX_FLAG_CAN_USE_HEAP; The way lguest does it in the lguest launcher is IMHO the cleanest of course, but it includes asm/bootparam.h and uses that in code: /* Boot protocol version: 2.07 supports the fields for lguest. */ boot->hdr.version = 0x207; But that does mean copying over or using the bootparam.h file and using linux data types. 3) Merge of xen way and using subarch_data Only set the subarch and subarch_data pointer, and have the kernel then read the generic data structure and parse it as it used to, the benefit is sharing a common entry point. No one uses this yet. But I think its a reasonable compromise. Perhaps there are other ways as well. > > If true, then by no means, no matter how hard we try, and no matter > > what we do on the Linux front to help clean things up will we be able > > to have a unified bare metal / Xen entry. I'm noting it could be > > possible though provided we do just set the zero page, the subarch to > > Xen and subarch_data to the Xen custom data structure. > > All you need is a very small stub which starts per the DMLite ABI, sets > up an appropriate zero_page, and jumps to the native entrypoint. Right. > However, this stub belongs in Linux, not in the Xen toolstack. That > way, when the Linux boot protocol is modified, both sides can be updated > accordingly. Makes sense the different entry points just seems best avoided if possible. Luis
On 01/25/2016 04:21 PM, H. Peter Anvin wrote: > On 01/25/16 13:12, Luis R. Rodriguez wrote: >>> Perhaps, but someone would still have to set hardware_subarch. And >>> it's hvmlite_bootparams() that does it. >> No, Xen would do it as well, essentially all of hvmlite_bootparams() could be >> done in Xen. >> > Or a stub code. This patch in fact is the stub for Xen HVMlite guests, after we are done with it we jump to bare-metal startup code (i.e startup_32|64) -boris
On 01/25/2016 05:19 PM, Luis R. Rodriguez wrote: > On Sat, Jan 23, 2016 at 02:49:36PM +0000, Andrew Cooper wrote: > > >> it causes inappropriate linkage between the >> toolstack and the version of Linux wishing to be booted. > There are ways to do it in such a way that it does not create dependency issues > on Linux specific code. > > > 0) The Xen way and EFI way > > A generic data structure is passed to the entry point on the kernel to > load the kernel. The kernel in turn must parse this generic specific struct > and interprets it and translate it to the kernel specific values. > > 1) The qemu way: > > One way is to simply not refer to the boot_params data structures but IMHO that > produces really ugly code. The qemu folks did it this way, so qemu does not use > arch/x86/include/uapi/asm/bootparam.h in any way, instead it uses offsets from > a char *header. Refer to load_linux() in hw/i386/pc.c, for instance: > > header[0x211] |= 0x80; /* CAN_USE_HEAP */ > > 2) The grub way: > > Another way, which grub uses, is to define their own data structures based > on arch/x86/include/uapi/asm/bootparam.h, with their own data types, and refer > to them in code, for instance refer to grub_cmd_linux() on the grub file > grub-core/loader/i386/pc/linux.c and its copy definition of the header definition > in include/grub/i386/linux.h. > > lh.loadflags |= GRUB_LINUX_FLAG_CAN_USE_HEAP; > > The way lguest does it in the lguest launcher is IMHO the cleanest of course, > but it includes asm/bootparam.h and uses that in code: > > /* Boot protocol version: 2.07 supports the fields for lguest. */ > boot->hdr.version = 0x207; > > But that does mean copying over or using the bootparam.h file and using > linux data types. > > 3) Merge of xen way and using subarch_data > > Only set the subarch and subarch_data pointer, and have the kernel then > read the generic data structure and parse it as it used to, the benefit > is sharing a common entry point. > > No one uses this yet. But I think its a reasonable compromise. > > Perhaps there are other ways as well. > >>> If true, then by no means, no matter how hard we try, and no matter >>> what we do on the Linux front to help clean things up will we be able >>> to have a unified bare metal / Xen entry. I'm noting it could be >>> possible though provided we do just set the zero page, the subarch to >>> Xen and subarch_data to the Xen custom data structure. >> All you need is a very small stub which starts per the DMLite ABI, sets >> up an appropriate zero_page, and jumps to the native entrypoint. > Right. I am trying to understand what your objection is to what is proposed in this patch. It is the size of the stub? If yes -- what would you like to leave out to be done later? -boris > >> However, this stub belongs in Linux, not in the Xen toolstack. That >> way, when the Linux boot protocol is modified, both sides can be updated >> accordingly. > Makes sense the different entry points just seems best avoided if possible. > > Luis
On Mon, Jan 25, 2016 at 05:28:08PM -0500, Boris Ostrovsky wrote: > On 01/25/2016 04:21 PM, H. Peter Anvin wrote: > >On 01/25/16 13:12, Luis R. Rodriguez wrote: > >>>Perhaps, but someone would still have to set hardware_subarch. And > >>>it's hvmlite_bootparams() that does it. > >>No, Xen would do it as well, essentially all of hvmlite_bootparams() could be > >>done in Xen. > >> > >Or a stub code. > > This patch in fact is the stub for Xen HVMlite guests, after we are > done with it we jump to bare-metal startup code (i.e startup_32|64) Right the point is the stub need not be in Linux, I'll explain in the other thread where I provided more details on the different known approaches. Luis
On Tue, Jan 26, 2016 at 10:34 AM, Luis R. Rodriguez <mcgrof@suse.com> wrote: > On Mon, Jan 25, 2016 at 05:28:08PM -0500, Boris Ostrovsky wrote: >> On 01/25/2016 04:21 PM, H. Peter Anvin wrote: >> >On 01/25/16 13:12, Luis R. Rodriguez wrote: >> >>>Perhaps, but someone would still have to set hardware_subarch. And >> >>>it's hvmlite_bootparams() that does it. >> >>No, Xen would do it as well, essentially all of hvmlite_bootparams() could be >> >>done in Xen. >> >> >> >Or a stub code. >> >> This patch in fact is the stub for Xen HVMlite guests, after we are >> done with it we jump to bare-metal startup code (i.e startup_32|64) > > Right the point is the stub need not be in Linux, I'll explain in the other > thread where I provided more details on the different known approaches. > ISTM if the Xen ABI-specified entry point has a different convention than the Linux native entry, then the stub should live in Linux. It would be just a couple if lines of code, right? The issue that caused headaches in the past isn't that there's code that's executed only on native, it's that there are whole big functions that are executed only on native for no good reason and that aren't clearly marked. If we had native_start_kernel and xen_start_kernel, and they both called very quickly in to common_start_kernel, it would be very clear what's going on. --Andy
On 01/26/2016 01:46 PM, Andy Lutomirski wrote: > On Tue, Jan 26, 2016 at 10:34 AM, Luis R. Rodriguez <mcgrof@suse.com> wrote: >> On Mon, Jan 25, 2016 at 05:28:08PM -0500, Boris Ostrovsky wrote: >>> On 01/25/2016 04:21 PM, H. Peter Anvin wrote: >>>> On 01/25/16 13:12, Luis R. Rodriguez wrote: >>>>>> Perhaps, but someone would still have to set hardware_subarch. And >>>>>> it's hvmlite_bootparams() that does it. >>>>> No, Xen would do it as well, essentially all of hvmlite_bootparams() could be >>>>> done in Xen. >>>>> >>>> Or a stub code. >>> This patch in fact is the stub for Xen HVMlite guests, after we are >>> done with it we jump to bare-metal startup code (i.e startup_32|64) >> Right the point is the stub need not be in Linux, I'll explain in the other >> thread where I provided more details on the different known approaches. >> > ISTM if the Xen ABI-specified entry point has a different convention > than the Linux native entry, then the stub should live in Linux. It > would be just a couple if lines of code, right? It's not exactly a couple of lines but it's not large neither. It mainly sets up boot_params (similar to what make_boot_params() does for EFI). Plus, for 64-bit, it loads identity page tables and switches to long mode. And then jumps to bare-meta startup code. > > The issue that caused headaches in the past isn't that there's code > that's executed only on native, it's that there are whole big > functions that are executed only on native for no good reason and that > aren't clearly marked. This won't happen with HVMlite. > > If we had native_start_kernel and xen_start_kernel, and they both > called very quickly in to common_start_kernel, it would be very clear > what's going on. What is now xen_start_kernel() is no longer the entry point for HVMlite. It is called as x86_init.oem.arch_setup() (or I may even move it to x86_hyper_xen.init_platform() or something like that). -boris
On Tue, Jan 26, 2016 at 11:00 AM, Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote: > On 01/26/2016 01:46 PM, Andy Lutomirski wrote: >> >> On Tue, Jan 26, 2016 at 10:34 AM, Luis R. Rodriguez <mcgrof@suse.com> >> wrote: >>> >>> On Mon, Jan 25, 2016 at 05:28:08PM -0500, Boris Ostrovsky wrote: >>>> >>>> On 01/25/2016 04:21 PM, H. Peter Anvin wrote: >>>>> >>>>> On 01/25/16 13:12, Luis R. Rodriguez wrote: >>>>>>> >>>>>>> Perhaps, but someone would still have to set hardware_subarch. And >>>>>>> it's hvmlite_bootparams() that does it. >>>>>> >>>>>> No, Xen would do it as well, essentially all of hvmlite_bootparams() >>>>>> could be >>>>>> done in Xen. >>>>>> >>>>> Or a stub code. >>>> >>>> This patch in fact is the stub for Xen HVMlite guests, after we are >>>> done with it we jump to bare-metal startup code (i.e startup_32|64) >>> >>> Right the point is the stub need not be in Linux, I'll explain in the >>> other >>> thread where I provided more details on the different known approaches. >>> >> ISTM if the Xen ABI-specified entry point has a different convention >> than the Linux native entry, then the stub should live in Linux. It >> would be just a couple if lines of code, right? > > > It's not exactly a couple of lines but it's not large neither. It mainly > sets up boot_params (similar to what make_boot_params() does for EFI). Plus, > for 64-bit, it loads identity page tables and switches to long mode. And > then jumps to bare-meta startup code. This terse summary provides an example of the issue I'm highlighting. Even though the stub is small we undermine its impact! Although the stub is small the different entry point also enables subtle additions which are required, although they are minimal they are important and if not considered for new x86 features causes regressions. I don't care what people tell me about "this should have been caught by code review, and no one CC'd me -- whaa!" -- this is a silly expectation and I think we should do better. Case in point: xen_start_kernel() has seen regressions now on both cr4_init_shadow() which you forgot to add to Xen Andy -- and later Boris fixed. Another example: the latest one is a kasan init -- which to this day remains fucked up -- a Kasan enabled PV guest crashes, and fixing is no where in sight. I flagged this a while ago, and I think we should put a proactive measure in place for that. The linker table stuff I'm doing was not for kicks -- its a means to an end here, and although I can't yet read subarch at early init, if we had it, then things link xen_start_kernel() could just be an x86 init stub, it'd be the first stub Xen runs. You can keep whatever boot_params setup stub it does today, even though I think its cleaner done in Xen, but at the very least it could at least instead just *read* the Xen custom generic structure to parse and set boot_params by using subarch_data pointer. I also have been reading the history of code changes on the other entry points in Linux and even though the code is small I see tons of commits in there for minor but critical fixes all over, likewise comments and recommendations and visions to unify / share code. I refuse to accept that we should undermine the issues of a new entry point or leaving the situation as-is with dual entry points for x86_64 / xen if we can instead just cleanly unify even asm entry points. >> The issue that caused headaches in the past isn't that there's code >> that's executed only on native, it's that there are whole big >> functions that are executed only on native for no good reason and that >> aren't clearly marked. Its more than that, as I noted. > This won't happen with HVMlite. And that's fine, its just I think we can avoid yet-another entry -- even if we still are coming into a common C entry later. >> If we had native_start_kernel and xen_start_kernel, and they both >> called very quickly in to common_start_kernel, it would be very clear >> what's going on. > > What is now xen_start_kernel() is no longer the entry point for HVMlite. It > is called as x86_init.oem.arch_setup() (or I may even move it to > x86_hyper_xen.init_platform() or something like that). And that's a huge win! Yet I invite us to consider other prospects to even merge more and simplify more. Luis
On Mon, Jan 25, 2016 at 05:55:02PM -0500, Boris Ostrovsky wrote: > On 01/25/2016 05:19 PM, Luis R. Rodriguez wrote: > >On Sat, Jan 23, 2016 at 02:49:36PM +0000, Andrew Cooper wrote: > > > > > >>it causes inappropriate linkage between the > >>toolstack and the version of Linux wishing to be booted. > >There are ways to do it in such a way that it does not create dependency issues > >on Linux specific code. > > > > > >0) The Xen way and EFI way > > > >A generic data structure is passed to the entry point on the kernel to > >load the kernel. The kernel in turn must parse this generic specific struct > >and interprets it and translate it to the kernel specific values. > > > >1) The qemu way: > > > >One way is to simply not refer to the boot_params data structures but IMHO that > >produces really ugly code. The qemu folks did it this way, so qemu does not use > >arch/x86/include/uapi/asm/bootparam.h in any way, instead it uses offsets from > >a char *header. Refer to load_linux() in hw/i386/pc.c, for instance: > > > >header[0x211] |= 0x80; /* CAN_USE_HEAP */ > > > >2) The grub way: > > > >Another way, which grub uses, is to define their own data structures based > >on arch/x86/include/uapi/asm/bootparam.h, with their own data types, and refer > >to them in code, for instance refer to grub_cmd_linux() on the grub file > >grub-core/loader/i386/pc/linux.c and its copy definition of the header definition > >in include/grub/i386/linux.h. > > > >lh.loadflags |= GRUB_LINUX_FLAG_CAN_USE_HEAP; > > > >The way lguest does it in the lguest launcher is IMHO the cleanest of course, > >but it includes asm/bootparam.h and uses that in code: > > > >/* Boot protocol version: 2.07 supports the fields for lguest. */ > >boot->hdr.version = 0x207; > > > >But that does mean copying over or using the bootparam.h file and using > >linux data types. > > > >3) Merge of xen way and using subarch_data > > > >Only set the subarch and subarch_data pointer, and have the kernel then > >read the generic data structure and parse it as it used to, the benefit > >is sharing a common entry point. > > > >No one uses this yet. But I think its a reasonable compromise. > > > >Perhaps there are other ways as well. > > > >>>If true, then by no means, no matter how hard we try, and no matter > >>>what we do on the Linux front to help clean things up will we be able > >>>to have a unified bare metal / Xen entry. I'm noting it could be > >>>possible though provided we do just set the zero page, the subarch to > >>>Xen and subarch_data to the Xen custom data structure. > >>All you need is a very small stub which starts per the DMLite ABI, sets > >>up an appropriate zero_page, and jumps to the native entrypoint. > >Right. > > > I am trying to understand what your objection is to what is proposed > in this patch. It is the size of the stub? If yes -- what would you > like to leave out to be done later? I explained that here briefly: > >>However, this stub belongs in Linux, not in the Xen toolstack. That > >>way, when the Linux boot protocol is modified, both sides can be updated > >>accordingly. > > >Makes sense the different entry points just seems best avoided if possible. But let me elaborate: We want to strive towards avoiding new entry points on Linux at all costs, there have been historic issues with having multiple entry points, some of these may relate to Xen, some not so much but one should not ignore history. A few related patches, this incldues EFI entry points since Konrad brings up Xen's approach was inspired by that as well. Read from bottom up. Kasan: not yet fixed, the feature completely ignored Xen PV guests as a possible candidate for x86. Xen PV guests with Kasan enabled crash. The fix is not so trivial. What I'd prefer is for kasan to have a feature to be disabled at run time, and with that in place we can simply at early boot disable Kasan early on boot on Xen until Kasan is properly implemented and supported on Xen. 5054daa285beaf706f051fbd395dc36c9f0f907f - x86/xen: Initialize cr4 shadow for 64-bit PV(H) guests Due to xen's separate entry point and the addition of a cr4 shadow, only native had its feature added, this made Xen crash. f3670394c29ff3730638762c1760fd2f624e6d7b - Revert "x86/efi: Fixup GOT in all boot code paths" Fails to boot LInus' Sony Vaio Pro 11 9cb0e394234d244fe5a97e743ec9dd7ddff7e64b - x86/efi: Fixup GOT in all boot code paths Because of the multiple boot entry points the fixup for GOT needs to be done in different places. 7e8213c1f3acc064aef37813a39f13cbfe7c3ce7 - x86/efi: Correct EFI boot stub use of code32_start Title says it. b8ff87a6158886771677e6dc8139bac6e3cba717 - x86/efi: Firmware agnostic handover entry points Fixes the issue of a mismatch on 32-bit and 64-bit between firmware and kernel run. 99f857db8857aff691c51302f93648263ed07eb1 - x86, build: Dynamically find entry points in compressed startup code Boot loaders abuse static entry point. Perhaps not directly related to Xen but its a lesson to learn that regardless of what you document people may abuse the entry points in very unexpected ways. f791620fa7517e1045742c475a7f005db9a634b8 - x86, efi: Fix 32-bit EFI handover protocol entry point After the EFI handover protocol was added 32-bit boots had an issue. 9ca8f72a9297f2052d806bd1111e176533aa69bd - x86, efi: Handover Protocol *Only* use an EFI boot stub for EFI capable to avoid duplicate code in boot loaders to set up and boot Linux. 51b26ada79b605ed709ddcedbb6012e8f8e0ebed - x86: unify arch/x86/boot/compressed/vmlinux_*.lds Linus unifies vmlinux_*.lds and calls to unify startup_32() and startup_64(). This hasn't been done yet. -- As someone new reading all this code / commits all I can do is cringe when I see yet-another-entry-point being added. I think we can do better, so we should strive to that unless there are real technical impossibilities here. What I'm proposing? 1) Lets see if we can put a proactive stop-gap to issues such as the cr4 shadow bug and Kasan bugs from creeping in. This should not only help PV but perhaps HVMLite. If you'd like to help with that refer to this thread: http://lkml.kernel.org/r/CAB=NE6VTCRCazcNpCdJ7pN1eD3=x_fcGOdH37MzVpxkKEN5esw@mail.gmail.com 2) asm code sharing prospects - rebranding of PVH to HVMlite It sounds like HVMlite is really just a clean PVH implementation so we'll be doing this it seems according to this approach: a) Since PVH brand is taken add new new Xen clean solution as "HVMLite implementation" b) Drop PVH c) Re-brand HVMLite as PVH Is there really no asm code to share between PV and HVMlite ? How about between PV and other Linux asm ? Specifically I'm talking about a huge new entry point called hvmlite_start_xen() of asm code. How much sharing or duplication is being avoided here? 3) C code sharing prospects: rebranding of PVH to HVMlite This code seems to share a lot from PV guest. Can we combine? 4) hardware_subarch, hardware_subarch_data and future prospects Your patch relies on a *new* Linux entry point. Sure, we had one for EFI, and sure there is another for Xen PV, but since you're just rebranding PVH to HVMlite and given historic issues with any new Linux entry points I'd like for us to take a breather and evaluate the extent our lofty unification goals, and how the x86 boot protocol could help with this already. Now, perhaps today it may seem as difficult to unify asm entry points today all over, but if we can take baby steps towards that I think that should be seriously evaluated. For instance, we should consider on relying on hardware_subarch and hardware_subarch_data to fetch the hvmlite_start_info by taking advantage of the x86 boot protocol. The hvmlite_start_info is what Xen uses to send us info about the guest as your patch proposes (this matches the Xen PV style entry), we stash it into a standard Linux boot_params variable called xen_hvmlite_boot_params for the Xen guest in hvmlite_bootparam(). This data structure and code seems to match very much the PV guest structure, why not just use a union and differentiate on PV subtype ? If you want to avoid a lot of PV calls for HVMlite it seems you could just take advantage of subarch Xen type, and differentiate on the subarch_data within Xen code to make that code just PV sepecific. I only see gains by using the Xen subarch, so would like to know why PC is being pushed. Luis
On 01/26/2016 03:30 PM, Luis R. Rodriguez wrote: > > What I'm proposing? > > 1) Lets see if we can put a proactive stop-gap to issues such as the cr4 shadow > bug and Kasan bugs from creeping in. This should not only help PV but perhaps > HVMLite. If you'd like to help with that refer to this thread: > > http://lkml.kernel.org/r/CAB=NE6VTCRCazcNpCdJ7pN1eD3=x_fcGOdH37MzVpxkKEN5esw@mail.gmail.com > > 2) asm code sharing prospects - rebranding of PVH to HVMlite > > It sounds like HVMlite is really just a clean PVH implementation so we'll > be doing this it seems according to this approach: > > a) Since PVH brand is taken add new new Xen clean solution as "HVMLite implementation" > b) Drop PVH > c) Re-brand HVMLite as PVH > > Is there really no asm code to share between PV and HVMlite ? Not the early boot code. > How about between > PV and other Linux asm ? Specifically I'm talking about a huge new entry point > called hvmlite_start_xen() of asm code. How much sharing or duplication > is being avoided here? I don't see that we can share much here. Especially given that hvmlite_start() is a 32-bit entry point so we can't start a 64-bit PV guest. I suppose we can find some common (C) code that sets boot_params but on the first sight they are quite different too. (And whether it's huge is a matter of opinion ;-). It is more than a couple of instructions, I'll give you that) > > 3) C code sharing prospects: rebranding of PVH to HVMlite > > This code seems to share a lot from PV guest. Can we combine? HVMlite closely follows HVM (and baremetal) code path. There is some common setup code with PV (which is what the second patch in this series is trying to factor out) > > 4) hardware_subarch, hardware_subarch_data and future prospects > > Your patch relies on a *new* Linux entry point. Sure, we had one > for EFI, and sure there is another for Xen PV, but since you're just > rebranding PVH to HVMlite and given historic issues with any new > Linux entry points I'd like for us to take a breather and evaluate > the extent our lofty unification goals, and how the x86 boot protocol > could help with this already. I am not sure I see how you can avoid having new entry point. For example, all HVMlite guests start in 32-bit mode. Who will switch to long mode? > > Now, perhaps today it may seem as difficult to unify asm entry points > today all over, but if we can take baby steps towards that I think that > should be seriously evaluated. > > For instance, we should consider on relying on hardware_subarch and > hardware_subarch_data to fetch the hvmlite_start_info by taking advantage of > the x86 boot protocol. The hvmlite_start_info is what Xen uses to send us info > about the guest as your patch proposes (this matches the Xen PV style entry), > we stash it into a standard Linux boot_params variable called > xen_hvmlite_boot_params for the Xen guest in hvmlite_bootparam(). This > data structure and code seems to match very much the PV guest structure, No, the two don't match at all. > why not just use a union and differentiate on PV subtype ? If you want to avoid > a lot of PV calls for HVMlite it seems you could just take advantage of > subarch Xen type, and differentiate on the subarch_data within Xen code > to make that code just PV sepecific. > > I only see gains by using the Xen subarch, so would like to know why PC is > being pushed. It's not that subarch 0 is being pushed here. It's that I don't see how it can be useful for this particular guest type. Maybe as we add new features (or discover something that we've missed up till now) we can switch to using it. If you think we should delay initializing boot_params until then --- I will disagree: boot_params are used before we look at subarch and I don't believe it makes sense to pick and choose what to initialize before and what can wait. (And I am not sure it can be useful on PV neither, at least the way it is used now. You will not reach the point in the (32-bit) code where it is tested. You will die way earlier (I think on startup_32()'s fourth instruction).) -boris
On Tue, Jan 26, 2016 at 04:51:38PM -0500, Boris Ostrovsky wrote: > On 01/26/2016 03:30 PM, Luis R. Rodriguez wrote: > hvmlite_start() is a 32-bit entry point [...] > > >4) hardware_subarch, hardware_subarch_data and future prospects > > > >Your patch relies on a *new* Linux entry point. Sure, we had one > >for EFI, and sure there is another for Xen PV, but since you're just > >rebranding PVH to HVMlite and given historic issues with any new > >Linux entry points I'd like for us to take a breather and evaluate > >the extent our lofty unification goals, and how the x86 boot protocol > >could help with this already. > > I am not sure I see how you can avoid having new entry point. For > example, all HVMlite guests start in 32-bit mode. Who will switch to > long mode? x86 i386 entry points need to have code to do all that stuff, this can happen for instance when you boot x86_64 from a 32-bit boot loader, and I think other things as well are possible that trigger this as well. > >why not just use a union and differentiate on PV subtype ? If you want to avoid > >a lot of PV calls for HVMlite it seems you could just take advantage of > >subarch Xen type, and differentiate on the subarch_data within Xen code > >to make that code just PV sepecific. > > > >I only see gains by using the Xen subarch, so would like to know why PC is > >being pushed. > > It's not that subarch 0 is being pushed here. It's that I don't see > how it can be useful for this particular guest type. Maybe as we add > new features (or discover something that we've missed up till now) > we can switch to using it. If you think we should delay initializing > boot_params until then --- I will disagree: boot_params are used > before we look at subarch and I don't believe it makes sense to pick > and choose what to initialize before and what can wait. subarch is part of boot_params, so not sure what I mean by an issue in timing here. The question is if its set and then how early can you possibly read the subarch from the boot_params. > (And I am not sure it can be useful on PV neither, Well.. > at least the way it is used now. that is the issue.. If we get access to boot_params on early boot then we can simply share the x86_64 entry point between PVH, PV, and native x86_64 with what I'm proposing and some minor extensions. > You will not reach the point in the (32-bit) code > where it is tested. You will die way earlier (I think on > startup_32()'s fourth instruction).) Its a bit different requirement for the subarch for PV/PVH and for HVMlite. Given what you have explained things are bit clearer now and I see the issue. You go: hvmlite_start_xen() --> HVM stub startup_64() | (startup_32() Note at the end of startup_32() though we have a neat asm set of entries that depend on the subarch type. Perhaps we should have a PV type early on startup_32() ? And/or I wonder if we can work off of the EFI boot loader code. Half baked thoughts for now, sorry have to go. Luis
On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote: > You go: > > hvmlite_start_xen() --> > HVM stub > startup_64() | (startup_32() Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not be legacy or crap / old, so we'd need a way to catch it if we should not use that code for this PV type. This begs the question, are you also sure other callers in startup_32() or startup_64() might be OK as well where previously guarded with pv_enabled() ? Luis
On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote: > > On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> wrote: > > You go: > > > > hvmlite_start_xen() --> > > HVM stub > > startup_64() | (startup_32() > > Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to > rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not > be legacy or crap / old, so we'd need a way to catch it if we should > not use that code for this PV type. This begs the question, are you > also sure other callers in startup_32() or startup_64() might be OK as > well where previously guarded with pv_enabled() ? Actually this call can't be used, and if early code used it prior to setup_arch() it'd be a bug as its only properly set until later. Vetting for correctness of all code call is still required though and perhaps we do need something to catch now this PV type on early code such as this one if we don't want it. From what I've gathered before on other bsp ucode we don't want ucode loaded for PV guest types through these mechanisms. Luis
On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote: > On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote: > > > > On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> > wrote: > > > You go: > > > > > > hvmlite_start_xen() --> > > > HVM stub > > > startup_64() | (startup_32() > > > > Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to > > rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not > > be legacy or crap / old, so we'd need a way to catch it if we should > > not use that code for this PV type. This begs the question, are you > > also sure other callers in startup_32() or startup_64() might be OK as > > well where previously guarded with pv_enabled() ? > > Actually this call can't be used, and if early code used it prior to > setup_arch() it'd be a bug as its only properly set until later. Vetting > for correctness of all code call is still required though and perhaps we do > need something to catch now this PV type on early code such as this one if > we don't want it. From what I've gathered before on other bsp ucode we > don't want ucode loaded for PV guest types through these mechanisms. It may help to not think of PVH/hvmlite as PV. It really is HVM with a lot of emulated devices removed. How does early microcode work on EFI? Does the EFI stub code have an early microcode loading code ? > > Luis > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote: > On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote: >> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote: >>> >>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> >> wrote: >>>> You go: >>>> >>>> hvmlite_start_xen() --> >>>> HVM stub >>>> startup_64() | (startup_32() >>> >>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to >>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not >>> be legacy or crap / old, so we'd need a way to catch it if we should >>> not use that code for this PV type. This begs the question, are you >>> also sure other callers in startup_32() or startup_64() might be OK as >>> well where previously guarded with pv_enabled() ? >> >> Actually this call can't be used, and if early code used it prior to >> setup_arch() it'd be a bug as its only properly set until later. Vetting >> for correctness of all code call is still required though and perhaps we do >> need something to catch now this PV type on early code such as this one if >> we don't want it. From what I've gathered before on other bsp ucode we >> don't want ucode loaded for PV guest types through these mechanisms. > > It may help to not think of PVH/hvmlite as PV. It really is HVM with a lot > of emulated devices removed. > > How does early microcode work on EFI? Does the EFI stub code have an early > microcode loading code ? Surely the interesting comparison here is how is (early) microcode loading disabled in KVM guests? We should use the same mechanism for HVMlite guests. David
On 01/27/2016 09:50 AM, David Vrabel wrote: > On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote: >> On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote: >>> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote: >>>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> >>> wrote: >>>>> You go: >>>>> >>>>> hvmlite_start_xen() --> >>>>> HVM stub >>>>> startup_64() | (startup_32() >>>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to >>>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not >>>> be legacy or crap / old, so we'd need a way to catch it if we should >>>> not use that code for this PV type. This begs the question, are you >>>> also sure other callers in startup_32() or startup_64() might be OK as >>>> well where previously guarded with pv_enabled() ? >>> Actually this call can't be used, and if early code used it prior to >>> setup_arch() it'd be a bug as its only properly set until later. Vetting >>> for correctness of all code call is still required though and perhaps we do >>> need something to catch now this PV type on early code such as this one if >>> we don't want it. From what I've gathered before on other bsp ucode we >>> don't want ucode loaded for PV guest types through these mechanisms. >> It may help to not think of PVH/hvmlite as PV. It really is HVM with a lot >> of emulated devices removed. >> >> How does early microcode work on EFI? Does the EFI stub code have an early >> microcode loading code ? > Surely the interesting comparison here is how is (early) microcode > loading disabled in KVM guests? We should use the same mechanism for > HVMlite guests. Why would we ever want to have a guest load microcode during boot? I can see how a (privileged) guest may want to load microcode from a shell (via microcode driver). -boris
On 27/01/16 15:06, Boris Ostrovsky wrote: > On 01/27/2016 09:50 AM, David Vrabel wrote: >> On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote: >>> On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote: >>>> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote: >>>>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> >>>> wrote: >>>>>> You go: >>>>>> >>>>>> hvmlite_start_xen() --> >>>>>> HVM stub >>>>>> startup_64() | (startup_32() >>>>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to >>>>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not >>>>> be legacy or crap / old, so we'd need a way to catch it if we should >>>>> not use that code for this PV type. This begs the question, are you >>>>> also sure other callers in startup_32() or startup_64() might be OK as >>>>> well where previously guarded with pv_enabled() ? >>>> Actually this call can't be used, and if early code used it prior to >>>> setup_arch() it'd be a bug as its only properly set until later. >>>> Vetting >>>> for correctness of all code call is still required though and >>>> perhaps we do >>>> need something to catch now this PV type on early code such as this >>>> one if >>>> we don't want it. From what I've gathered before on other bsp ucode we >>>> don't want ucode loaded for PV guest types through these mechanisms. >>> It may help to not think of PVH/hvmlite as PV. It really is HVM with >>> a lot >>> of emulated devices removed. >>> >>> How does early microcode work on EFI? Does the EFI stub code have an >>> early >>> microcode loading code ? >> Surely the interesting comparison here is how is (early) microcode >> loading disabled in KVM guests? We should use the same mechanism for ^^^^^^^^ >> HVMlite guests. > > > Why would we ever want to have a guest load microcode during boot? I can > see how a (privileged) guest may want to load microcode from a shell > (via microcode driver). I think you missed a word when you read my reply. David
On 01/27/2016 10:09 AM, David Vrabel wrote: > On 27/01/16 15:06, Boris Ostrovsky wrote: >> On 01/27/2016 09:50 AM, David Vrabel wrote: >>> On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote: >>>> On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote: >>>>> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote: >>>>>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> >>>>> wrote: >>>>>>> You go: >>>>>>> >>>>>>> hvmlite_start_xen() --> >>>>>>> HVM stub >>>>>>> startup_64() | (startup_32() >>>>>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to >>>>>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not >>>>>> be legacy or crap / old, so we'd need a way to catch it if we should >>>>>> not use that code for this PV type. This begs the question, are you >>>>>> also sure other callers in startup_32() or startup_64() might be OK as >>>>>> well where previously guarded with pv_enabled() ? >>>>> Actually this call can't be used, and if early code used it prior to >>>>> setup_arch() it'd be a bug as its only properly set until later. >>>>> Vetting >>>>> for correctness of all code call is still required though and >>>>> perhaps we do >>>>> need something to catch now this PV type on early code such as this >>>>> one if >>>>> we don't want it. From what I've gathered before on other bsp ucode we >>>>> don't want ucode loaded for PV guest types through these mechanisms. >>>> It may help to not think of PVH/hvmlite as PV. It really is HVM with >>>> a lot >>>> of emulated devices removed. >>>> >>>> How does early microcode work on EFI? Does the EFI stub code have an >>>> early >>>> microcode loading code ? >>> Surely the interesting comparison here is how is (early) microcode >>> loading disabled in KVM guests? We should use the same mechanism for > ^^^^^^^^ >>> HVMlite guests. >> >> Why would we ever want to have a guest load microcode during boot? I can >> see how a (privileged) guest may want to load microcode from a shell >> (via microcode driver). > I think you missed a word when you read my reply. Yes, I missed it ;-) Why not continue relying on paravirt_enabled()? We are going to keep this in some form for HVMlite. -boris
On Wed, Jan 27, 2016 at 10:17:56AM -0500, Boris Ostrovsky wrote: > On 01/27/2016 10:09 AM, David Vrabel wrote: > >On 27/01/16 15:06, Boris Ostrovsky wrote: > >>On 01/27/2016 09:50 AM, David Vrabel wrote: > >>>On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote: > >>>>On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote: > >>>>>On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote: > >>>>>>On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> > >>>>>wrote: > >>>>>>>You go: > >>>>>>> > >>>>>>>hvmlite_start_xen() --> > >>>>>>> HVM stub > >>>>>>> startup_64() | (startup_32() > >>>>>>Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to > >>>>>>rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not > >>>>>>be legacy or crap / old, so we'd need a way to catch it if we should > >>>>>>not use that code for this PV type. This begs the question, are you > >>>>>>also sure other callers in startup_32() or startup_64() might be OK as > >>>>>>well where previously guarded with pv_enabled() ? > >>>>>Actually this call can't be used, and if early code used it prior to > >>>>>setup_arch() it'd be a bug as its only properly set until later. > >>>>>Vetting > >>>>>for correctness of all code call is still required though and > >>>>>perhaps we do > >>>>>need something to catch now this PV type on early code such as this > >>>>>one if > >>>>>we don't want it. From what I've gathered before on other bsp ucode we > >>>>>don't want ucode loaded for PV guest types through these mechanisms. > >>>>It may help to not think of PVH/hvmlite as PV. It really is HVM with > >>>>a lot > >>>>of emulated devices removed. > >>>> > >>>>How does early microcode work on EFI? Does the EFI stub code have an > >>>>early > >>>>microcode loading code ? > >>>Surely the interesting comparison here is how is (early) microcode > >>>loading disabled in KVM guests? We should use the same mechanism for > > ^^^^^^^^ > >>>HVMlite guests. > >> > >>Why would we ever want to have a guest load microcode during boot? I can > >>see how a (privileged) guest may want to load microcode from a shell > >>(via microcode driver). > >I think you missed a word when you read my reply. > > Yes, I missed it ;-) > > Why not continue relying on paravirt_enabled()? We are going to keep this in > some form for HVMlite. And this is where Luis comes in. He has posted an patchset which removes the paravirt_enabled with .. Here is the link https://lkml.org/lkml/2015/12/15/772
On Wed, Jan 27, 2016 at 02:50:36PM +0000, David Vrabel wrote: > Surely the interesting comparison here is how is (early) microcode > loading disabled in KVM guests? It isn't - kvm simply ignores the write to the microcode application MSRs MSR_AMD64_PATCH_LOADER and MSR_IA32_UCODE_REV, respectively. > We should use the same mechanism for HVMlite guests. Good idea.
On 01/27/2016 10:29 AM, Konrad Rzeszutek Wilk wrote: > On Wed, Jan 27, 2016 at 10:17:56AM -0500, Boris Ostrovsky wrote: >> On 01/27/2016 10:09 AM, David Vrabel wrote: >>> On 27/01/16 15:06, Boris Ostrovsky wrote: >>>> On 01/27/2016 09:50 AM, David Vrabel wrote: >>>>> On 27/01/16 14:42, Konrad Rzeszutek Wilk wrote: >>>>>> On Tue, Jan 26, 2016 at 08:54:56PM -0800, Luis R. Rodriguez wrote: >>>>>>> On Jan 26, 2016 6:16 PM, "Luis R. Rodriguez" <mcgrof@suse.com> wrote: >>>>>>>> On Tue, Jan 26, 2016 at 4:04 PM, Luis R. Rodriguez <mcgrof@suse.com> >>>>>>> wrote: >>>>>>>>> You go: >>>>>>>>> >>>>>>>>> hvmlite_start_xen() --> >>>>>>>>> HVM stub >>>>>>>>> startup_64() | (startup_32() >>>>>>>> Hrm, does HVMlite work well with load_ucode_bsp(), note the patches to >>>>>>>> rebrand pv_enabled() to pv_legacy() or whatever, this PV type will not >>>>>>>> be legacy or crap / old, so we'd need a way to catch it if we should >>>>>>>> not use that code for this PV type. This begs the question, are you >>>>>>>> also sure other callers in startup_32() or startup_64() might be OK as >>>>>>>> well where previously guarded with pv_enabled() ? >>>>>>> Actually this call can't be used, and if early code used it prior to >>>>>>> setup_arch() it'd be a bug as its only properly set until later. >>>>>>> Vetting >>>>>>> for correctness of all code call is still required though and >>>>>>> perhaps we do >>>>>>> need something to catch now this PV type on early code such as this >>>>>>> one if >>>>>>> we don't want it. From what I've gathered before on other bsp ucode we >>>>>>> don't want ucode loaded for PV guest types through these mechanisms. >>>>>> It may help to not think of PVH/hvmlite as PV. It really is HVM with >>>>>> a lot >>>>>> of emulated devices removed. >>>>>> >>>>>> How does early microcode work on EFI? Does the EFI stub code have an >>>>>> early >>>>>> microcode loading code ? >>>>> Surely the interesting comparison here is how is (early) microcode >>>>> loading disabled in KVM guests? We should use the same mechanism for >>> ^^^^^^^^ >>>>> HVMlite guests. >>>> Why would we ever want to have a guest load microcode during boot? I can >>>> see how a (privileged) guest may want to load microcode from a shell >>>> (via microcode driver). >>> I think you missed a word when you read my reply. >> Yes, I missed it ;-) >> >> Why not continue relying on paravirt_enabled()? We are going to keep this in >> some form for HVMlite. > And this is where Luis comes in. He has posted an patchset which removes the > paravirt_enabled with .. Here is the link https://lkml.org/lkml/2015/12/15/772 Yes, I saw that and this will be renamed as paravirt_legacy() (which I am not sure is really what it should be called.) Another option is to have early microcode code query CPUID to see whether we are running on a hypervisor (this in fact is what we originally thought of doing before realizing that we have paravirt_enabled()). But then how is HVMlite different from a regular HVM guest trying to load microcode? (BTW, to answer David's question about what KVM is doing --- it is ignoring writes to microcode MSRs, see kvm_set_msr_common().) -boris
Bleh moving forward please use mcgrof@kernel.org, that will be sent to my employer and my personal address. More below. On Wed, Jan 27, 2016 at 8:15 AM, Boris Ostrovsky <boris.ostrovsky@oracle.com> wrote: > On 01/27/2016 10:29 AM, Konrad Rzeszutek Wilk wrote: >> On Wed, Jan 27, 2016 at 10:17:56AM -0500, Boris Ostrovsky wrote: >>> On 01/27/2016 10:09 AM, David Vrabel wrote: >>>> On 27/01/16 15:06, Boris Ostrovsky wrote: >>> Why not continue relying on paravirt_enabled()? We are going to keep this >>> in >>> some form for HVMlite. >> >> And this is where Luis comes in. He has posted an patchset which removes >> the >> paravirt_enabled with .. Here is the link >> https://lkml.org/lkml/2015/12/15/772 > > > Yes, I saw that and this will be renamed as paravirt_legacy() (which I am > not sure is really what it should be called.) Given Konrad's pointers about some folks pushing for BIOS to have a "legacy free option" where all legacy crap (PS/2, PnP, serial port, parallel port) are all disabled [0], I'm inclined to respin the rename patch to use x86_legacy_free(). This can later then be extended provided such BIOS check becomes available. [0] http://lkml.kernel.org/r/20160120193241.GA4769@char.us.oracle.com But -- as I also pointed out to hpa recently, there is an issue with using things like cpu_has_hypervisor() (or in this case paravirt_enabled()) for early code, given that it relies on init_hypervisor_platform() having been called first which is called on setup_arch() [1]. So paravirt_enabled() really can only be used prior to setup_arch() correctly (specifically after, init_hypervisor_platform()), and anything else would be a bug. I learned the hard way while doing some linker table work and trying to find a good use / solution to the dead code concerns I've been aiming to address due to Xen's separate entry point and the nature of pvops [2] [3]. hpa's response to this issue was that we cannot and should not abuse the boot_params hardware_subarch for this purpose (in this case I was suggesting perhaps KVM could use it if it in the future needed it for a kvm check earlier than setup_arch()), but he noted that: "If you have a genuine need for a "hypervisor type" then that is a separate thing and should be treated separately from subarch. However, you need to consider that some hypervisors can emulate other hypervisors and you may have more than one hypervisor API available." This goes along with my suggestion here earlier where I mentioned that subarch is already used nicely to pivot off some specific subarch entry after startup_32(), if we want to avoid yet-another-entry for HVMlite (the PVH rebrand done cleanly) and still use startup_32() for it we could perhaps follow similar strategy as with subarch but instead add a hypervisor type and use that for a stub call the hypervisor type if needed early on in startup_32(). This could perhaps in turn also be used as a generic hypervisor type / and more robust / easier / not-so-convoluted check. For instance of what I think is a convoluted check refer to snd_intel8x0_inside_vm() in sound/pci/intel8x0.c with its use of kvm_para_available(), #ifdefery over boot_cpu_has(X86_FEATURE_HYPERVISOR) (which is just cpu_has_hypervisor()). If we had a hypervisor type easily accessible it could both be used by early init code and perhaps driver code to replace these convoluted checks. If this seems a bit sensible the next question to ask if -- how we'd *set* the hypervisor type, do we extend the x86 boot protocol and enable a hypervisor type on the zero page? That would clearly only make sense if: 1) Its reasonable to x86 maintainers (perhaps pending this discussion's outcome? If its preemptively known to not reasonable an alternative suggestion would be appreciated) 2) Xen is willing to actually set this (and perhaps a new hypervisor_type_data for the private data structure), but not much more would be needed, and incorporate this as part of the DmLite protocol, or whatever its called as an option for some OSes 3) The kernel could get access to this value from the zero page really early on, immediately following startup_32() Worth mentioning here also is hpa's clarification on when subarch type PC (0) should be used: [it should be used if the hardware is] "enumerable using standard PC mechanisms (PCI, ACPI) and doesn't need a special boot flow" -- does that fit HVMLite's description so far? If so then The Xen subarch may need to be redefined as well to be clear what it means. I don't think we need to be precise but at the very least cover grounds to enable the definitions to meet its actual use to not confuse users. [1] http://lkml.kernel.org/r/CAB=NE6WoKsP8+KGnJEtigWYktCMjg6iherCOcq-jskxi4P2QqA@mail.gmail.com [2] http://www.do-not-panic.com/2015/12/avoiding-dead-code-pvops-not-silver-bullet.html [3] http://www.do-not-panic.com/2015/12/xen-and-x86-linux-zero-page.html [4] http://lkml.kernel.org/r/56A17B01.9040708@zytor.com > Another option is to have early microcode code query CPUID to see whether we > are running on a hypervisor (this in fact is what we originally thought of > doing before realizing that we have paravirt_enabled()). Please keep in mind we want to consider use and setting of something generic, not just a solution for Xen, and if we want to use this to help avoid a new entry point then strive for something we actually could get access to very early on the first x86 entry point, both for 32-bit and 64-bit. If its accessible for both 32-bit and 64-bit well hell, we could unify not only the entry points for HVMLite thing, but I'm pretty sure also unify the entry points for PV and x86 as well (if we wanted that) ! Luis
On Wed, Jan 27, 2016 at 10:48 AM, Luis R. Rodriguez <mcgrof@kernel.org> wrote: > > Worth mentioning here also is hpa's clarification on when subarch type > PC (0) should be used: [it should be used if the hardware is] > "enumerable using standard PC mechanisms (PCI, ACPI) and doesn't need > a special boot flow" -- does that fit HVMLite's description so far? If > so then The Xen subarch may need to be redefined as well to be clear > what it means. I don't think we need to be precise but at the very > least cover grounds to enable the definitions to meet its actual use > to not confuse users. Another thing to consider for HVMlite is that if the 0 subarch (PC) is used in light of my linker table work and x86's use of it with the subarch and supported subarch bitmask, is that it would also mean HVMLite would run all routines currently pegged as needing PC type (the current KVM and bare metal path) and it would mean not running anything only pegged with Xen subarch type (but note that today Xen doesn't even set the subarch type). If there is nothing in common between PV and HVMlite (no x86 init calls to share), and if HVMLite *can* call *alllllllll* PC init calls, then by all means this is fine, and if we just need to distinguish stuff between PC types that's fine, it may still be possible to further extend hypervisor_type to the x86 init calls I'm adding as another supported_hyper_types to ensure even though a subarch is being used, that we also check the supported hypervisor type as well. Luis
On 01/27/2016 02:00 PM, Luis R. Rodriguez wrote: > On Wed, Jan 27, 2016 at 10:48 AM, Luis R. Rodriguez <mcgrof@kernel.org> wrote: >> Worth mentioning here also is hpa's clarification on when subarch type >> PC (0) should be used: [it should be used if the hardware is] >> "enumerable using standard PC mechanisms (PCI, ACPI) and doesn't need >> a special boot flow" -- does that fit HVMLite's description so far? If >> so then The Xen subarch may need to be redefined as well to be clear >> what it means. I don't think we need to be precise but at the very >> least cover grounds to enable the definitions to meet its actual use >> to not confuse users. > Another thing to consider for HVMlite is that if the 0 subarch (PC) is > used in light of my linker table work and x86's use of it with the > subarch and supported subarch bitmask, is that it would also mean > HVMLite would run all routines currently pegged as needing PC type > (the current KVM and bare metal path) and it would mean not running > anything only pegged with Xen subarch type (but note that today Xen > doesn't even set the subarch type). If there is nothing in common > between PV and HVMlite (no x86 init calls to share), and if HVMLite > *can* call *alllllllll* PC init calls, then by all means this is fine, Yes, that's the idea. HVMlite jumps to startup_32|64 from the stub and runs from there with subarch 0. -boris > and if we just need to distinguish stuff between PC types that's fine, > it may still be possible to further extend hypervisor_type to the x86 > init calls I'm adding as another supported_hyper_types to ensure even > though a subarch is being used, that we also check the supported > hypervisor type as well. > > Luis
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index e47e527..1d913d7 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o obj-$(CONFIG_XEN_EFI) += efi.o +obj-$(CONFIG_XEN_PVHVM) += xen-hvmlite.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2cf446a..2ed8b2b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -118,7 +118,8 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); */ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); -enum xen_domain_type xen_domain_type = XEN_NATIVE; +enum xen_domain_type xen_domain_type + __attribute__((section(".data"))) = XEN_NATIVE; EXPORT_SYMBOL_GPL(xen_domain_type); unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; @@ -171,6 +172,17 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +#ifdef CONFIG_XEN_PVHVM +/* + * HVMlite variables. These need to live in data segment since they are + * initialized before startup_{32|64}, which clear .bss, are invoked. + */ +int xen_hvmlite __attribute__((section(".data"))) = 0; +struct hvm_start_info hvmlite_start_info __attribute__((section(".data"))); +uint hvmlite_start_info_sz = sizeof(hvmlite_start_info); +struct boot_params xen_hvmlite_boot_params __attribute__((section(".data"))); +#endif + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -1736,6 +1748,83 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif } +#ifdef CONFIG_XEN_PVHVM +static void __init hvmlite_bootparams(void) +{ + struct xen_memory_map memmap; + int i; + + memset(&xen_hvmlite_boot_params, 0, sizeof(xen_hvmlite_boot_params)); + + memmap.nr_entries = ARRAY_SIZE(xen_hvmlite_boot_params.e820_map); + set_xen_guest_handle(memmap.buffer, xen_hvmlite_boot_params.e820_map); + if (HYPERVISOR_memory_op(XENMEM_memory_map, &memmap)) { + xen_raw_console_write("XENMEM_memory_map failed\n"); + BUG(); + } + + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].addr = + ISA_START_ADDRESS; + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].size = + ISA_END_ADDRESS - ISA_START_ADDRESS; + xen_hvmlite_boot_params.e820_map[memmap.nr_entries++].type = + E820_RESERVED; + + sanitize_e820_map(xen_hvmlite_boot_params.e820_map, + ARRAY_SIZE(xen_hvmlite_boot_params.e820_map), + &memmap.nr_entries); + + xen_hvmlite_boot_params.e820_entries = memmap.nr_entries; + for (i = 0; i < xen_hvmlite_boot_params.e820_entries; i++) + e820_add_region(xen_hvmlite_boot_params.e820_map[i].addr, + xen_hvmlite_boot_params.e820_map[i].size, + xen_hvmlite_boot_params.e820_map[i].type); + + xen_hvmlite_boot_params.hdr.cmd_line_ptr = + hvmlite_start_info.cmdline_paddr; + + /* The first module is always ramdisk */ + if (hvmlite_start_info.nr_modules) { + struct hvm_modlist_entry *modaddr = + __va(hvmlite_start_info.modlist_paddr); + xen_hvmlite_boot_params.hdr.ramdisk_image = modaddr->paddr; + xen_hvmlite_boot_params.hdr.ramdisk_size = modaddr->size; + } + + /* + * See Documentation/x86/boot.txt. + * + * Version 2.12 supports Xen entry point but we will use default x86/PC + * environment (i.e. hardware_subarch 0). + */ + xen_hvmlite_boot_params.hdr.version = 0x212; + xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */ +} + +/* + * This routine (and those that it might call) should not use + * anything that lives in .bss since that segment will be cleared later + */ +void __init xen_prepare_hvmlite(void) +{ + u32 eax, ecx, edx, msr; + u64 pfn; + + cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + pv_info.name = "Xen HVMlite"; + xen_domain_type = XEN_HVM_DOMAIN; + xen_hvmlite = 1; + + x86_init.oem.arch_setup = xen_init_kernel; + x86_init.oem.banner = xen_banner; + + hvmlite_bootparams(); +} +#endif + void __ref xen_hvm_init_shared_info(void) { int cpu; diff --git a/arch/x86/xen/xen-hvmlite.S b/arch/x86/xen/xen-hvmlite.S new file mode 100644 index 0000000..90f03d0 --- /dev/null +++ b/arch/x86/xen/xen-hvmlite.S @@ -0,0 +1,158 @@ +/* + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + .code32 + .text +#define _pa(x) ((x) - __START_KERNEL_map) + +#include <linux/elfnote.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/asm.h> +#include <asm/boot.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <xen/interface/elfnote.h> + + __HEAD + .code32 + +/* Entry point for HVMlite guests */ +ENTRY(hvmlite_start_xen) + cli + cld + + mov $_pa(gdt), %eax + lgdt (%eax) + + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%ss + + /* Stash hvm_start_info */ + mov $_pa(hvmlite_start_info), %edi + mov %ebx, %esi + mov $_pa(hvmlite_start_info_sz), %ecx + mov (%ecx), %ecx + rep + movsb + + movl $_pa(early_stack_end), %eax + movl %eax, %esp + + /* Enable PAE mode */ + movl %cr4, %eax + orl $X86_CR4_PAE, %eax + movl %eax, %cr4 + +#ifdef CONFIG_X86_64 + /* Enable Long mode */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Enable pre-constructed page tables */ + mov $_pa(init_level4_pgt), %eax + movl %eax, %cr3 + movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl %eax, %cr0 + + /* Jump to 64-bit mode. */ + pushl $__KERNEL_CS + leal _pa(1f), %eax + pushl %eax + lret + + /* 64-bit entry point */ + .code64 +1: + call xen_prepare_hvmlite + + /* startup_64 expects boot_params in %rsi */ + mov $_pa(xen_hvmlite_boot_params), %rsi + movq $_pa(startup_64), %rax + jmp *%rax + +#else /* CONFIG_X86_64 */ + + /* Use initial_page table and set level 2 to map 2M pages */ + movl $_pa(initial_pg_pmd), %edi + movl $(_PAGE_PSE | _PAGE_RW | _PAGE_PRESENT), %eax + movl $2048, %ecx +2: + movl %eax, 0(%edi) + addl $0x00200000, %eax + addl $8, %edi + decl %ecx + jnz 2b + + /* Enable the boot paging */ + movl $_pa(initial_page_table), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $(X86_CR0_PG | X86_CR0_PE), %eax + movl %eax, %cr0 + + ljmp $__BOOT_CS,$3f +3: + call xen_prepare_hvmlite + mov $_pa(xen_hvmlite_boot_params), %esi + + /* startup_32 doesn't expect paging and PAE to be on */ + ljmp $__BOOT_CS,$_pa(4f) +4: + movl %cr0, %eax + andl $~X86_CR0_PG, %eax + movl %eax, %cr0 + movl %cr4, %eax + andl $~X86_CR4_PAE, %eax + movl %eax, %cr4 + + /* Restore initial_pg_pmd to its original (zero) state */ + movl $_pa(initial_pg_pmd), %edi + xorl %eax, %eax + movl $(PAGE_SIZE/4), %ecx + rep stosl + + ljmp $0x10, $_pa(startup_32) +#endif + + .data +gdt: + .word gdt_end - gdt + .long _pa(gdt) + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ +#ifdef CONFIG_X86_64 + .quad 0x00af9a000000ffff /* __KERNEL_CS */ +#else + .quad 0x00cf9a000000ffff /* __KERNEL_CS */ +#endif + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +gdt_end: + + .bss + .balign 4 +early_stack: + .fill 16, 1, 0 +early_stack_end: + + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, + _ASM_PTR (hvmlite_start_xen - __START_KERNEL_map)) diff --git a/include/xen/xen.h b/include/xen/xen.h index 0c0e3ef..6a0d3f3 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -29,6 +29,12 @@ extern enum xen_domain_type xen_domain_type; #define xen_initial_domain() (0) #endif /* CONFIG_XEN_DOM0 */ +#ifdef CONFIG_XEN_PVHVM +extern int xen_hvmlite; +#else +#define xen_hvmlite (0) +#endif + #ifdef CONFIG_XEN_PVH /* This functionality exists only for x86. The XEN_PVHVM support exists * only in x86 world - hence on ARM it will be always disabled.
Start HVMlite guest XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall page, initialize boot_params, enable early page tables. Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> --- arch/x86/xen/Makefile | 1 + arch/x86/xen/enlighten.c | 91 +++++++++++++++++++++++++- arch/x86/xen/xen-hvmlite.S | 158 ++++++++++++++++++++++++++++++++++++++++++++ include/xen/xen.h | 6 ++ 4 files changed, 255 insertions(+), 1 deletions(-) create mode 100644 arch/x86/xen/xen-hvmlite.S