Message ID | 1511897682-32060-1-git-send-email-maran.wilson@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 28/11/17 19:34, Maran Wilson wrote: > For certain applications it is desirable to rapidly boot a KVM virtual > machine. In cases where legacy hardware and software support within the > guest is not needed, Qemu should be able to boot directly into the > uncompressed Linux kernel binary without the need to run firmware. > > There already exists an ABI to allow this for Xen PVH guests and the ABI is > supported by Linux and FreeBSD: > > https://xenbits.xen.org/docs/unstable/misc/hvmlite.html Just FYI, this link has recently become stale, following some cleanup. The document is now: https://xenbits.xen.org/docs/unstable/misc/pvh.html ~Andrew > > This PoC patch enables Qemu to use that same entry point for booting KVM > guests. > > Even though the code is still PoC quality, I'm sending this as an RFC now > since there are a number of different ways the specific implementation > details can be handled. I chose a shared code path for Xen and KVM guests > but could just as easily create a separate code path that is advertised by > a different ELF note for KVM. There also seems to be some flexibility in > how the e820 table data is passed and how (or if) it should be identified > as e820 data. As a starting point, I've chosen the options that seem to > result in the smallest patch with minimal to no changes required of the > x86/HVM direct boot ABI. 
> --- > arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------ > 1 file changed, 55 insertions(+), 19 deletions(-) > > diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c > index 98ab176..d93f711 100644 > --- a/arch/x86/xen/enlighten_pvh.c > +++ b/arch/x86/xen/enlighten_pvh.c > @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void) > acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; > } > > -static void __init init_pvh_bootparams(void) > +static void __init init_pvh_bootparams(bool xen_guest) > { > struct xen_memory_map memmap; > int rc; > > memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); > > - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); > - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); > - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); > - if (rc) { > - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); > - BUG(); > + if (xen_guest) { > + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); > + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); > + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); > + if (rc) { > + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); > + BUG(); > + } > + pvh_bootparams.e820_entries = memmap.nr_entries; > + } else if (pvh_start_info.nr_modules > 1) { > + /* The second module should be the e820 data for KVM guests */ > + struct hvm_modlist_entry *modaddr; > + char e820_sig[] = "e820 data"; > + struct boot_e820_entry *ep; > + struct e820_table *tp; > + char *cmdline_str; > + int idx; > + > + modaddr = __va(pvh_start_info.modlist_paddr + > + sizeof(struct hvm_modlist_entry)); > + cmdline_str = __va(modaddr->cmdline_paddr); > + > + if ((modaddr->cmdline_paddr) && > + (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) { > + tp = __va(modaddr->paddr); > + ep = (struct boot_e820_entry *)tp->entries; > + > + pvh_bootparams.e820_entries = tp->nr_entries; > + > + for (idx = 0; idx < tp->nr_entries ; idx++, ep++) > + 
pvh_bootparams.e820_table[idx] = *ep; > + } > } > - pvh_bootparams.e820_entries = memmap.nr_entries; > > if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { > pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr = > @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void) > pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type = > E820_TYPE_RESERVED; > pvh_bootparams.e820_entries++; > - } else > + } else if (xen_guest) { > xen_raw_printk("Warning: Can fit ISA range into e820\n"); > + } > > pvh_bootparams.hdr.cmd_line_ptr = > pvh_start_info.cmdline_paddr; > @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void) > * environment (i.e. hardware_subarch 0). > */ > pvh_bootparams.hdr.version = 0x212; > - pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ > + pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0; > } > > /* > @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void) > */ > void __init xen_prepare_pvh(void) > { > - u32 msr; > + > + u32 msr = xen_cpuid_base(); > u64 pfn; > + bool xen_guest = msr ? true : false; > > if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { > - xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", > - pvh_start_info.magic); > + if (xen_guest) > + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", > + pvh_start_info.magic); > BUG(); > } > > - xen_pvh = 1; > + if (xen_guest) { > + xen_pvh = 1; > + > + msr = cpuid_ebx(msr + 2); > + pfn = __pa(hypercall_page); > + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); > + > + } else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) { > + BUG(); > + } > > - msr = cpuid_ebx(xen_cpuid_base() + 2); > - pfn = __pa(hypercall_page); > - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); > + init_pvh_bootparams(xen_guest); > > - init_pvh_bootparams(); > + if (xen_guest) > + x86_init.oem.arch_setup = xen_pvh_arch_setup; > > - x86_init.oem.arch_setup = xen_pvh_arch_setup; > }
On Tue, Nov 28, 2017 at 11:34:42AM -0800, Maran Wilson wrote: > This PoC patch enables Qemu to use that same entry point for booting KVM > guests. Nice. I do a lot of -kernel boots in qemu/kvm for testing, and speeding this further up would be great.
On 28/11/17 20:34, Maran Wilson wrote: > For certain applications it is desirable to rapidly boot a KVM virtual > machine. In cases where legacy hardware and software support within the > guest is not needed, Qemu should be able to boot directly into the > uncompressed Linux kernel binary without the need to run firmware. > > There already exists an ABI to allow this for Xen PVH guests and the ABI is > supported by Linux and FreeBSD: > > https://xenbits.xen.org/docs/unstable/misc/hvmlite.html > > This PoC patch enables Qemu to use that same entry point for booting KVM > guests. > > Even though the code is still PoC quality, I'm sending this as an RFC now > since there are a number of different ways the specific implementation > details can be handled. I chose a shared code path for Xen and KVM guests > but could just as easily create a separate code path that is advertised by > a different ELF note for KVM. There also seems to be some flexibility in > how the e820 table data is passed and how (or if) it should be identified > as e820 data. As a starting point, I've chosen the options that seem to > result in the smallest patch with minimal to no changes required of the > x86/HVM direct boot ABI. I like the idea. I'd rather split up the different hypervisor types early and use a common set of service functions instead of special casing xen_guest everywhere. This would make it much easier to support the KVM PVH boot without the need to configure the kernel with CONFIG_XEN. Another option would be to use the same boot path as with grub: set the boot params in zeropage and start at startup_32. 
Juergen > --- > arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------ > 1 file changed, 55 insertions(+), 19 deletions(-) > > diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c > index 98ab176..d93f711 100644 > --- a/arch/x86/xen/enlighten_pvh.c > +++ b/arch/x86/xen/enlighten_pvh.c > @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void) > acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; > } > > -static void __init init_pvh_bootparams(void) > +static void __init init_pvh_bootparams(bool xen_guest) > { > struct xen_memory_map memmap; > int rc; > > memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); > > - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); > - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); > - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); > - if (rc) { > - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); > - BUG(); > + if (xen_guest) { > + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); > + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); > + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); > + if (rc) { > + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); > + BUG(); > + } > + pvh_bootparams.e820_entries = memmap.nr_entries; > + } else if (pvh_start_info.nr_modules > 1) { > + /* The second module should be the e820 data for KVM guests */ > + struct hvm_modlist_entry *modaddr; > + char e820_sig[] = "e820 data"; > + struct boot_e820_entry *ep; > + struct e820_table *tp; > + char *cmdline_str; > + int idx; > + > + modaddr = __va(pvh_start_info.modlist_paddr + > + sizeof(struct hvm_modlist_entry)); > + cmdline_str = __va(modaddr->cmdline_paddr); > + > + if ((modaddr->cmdline_paddr) && > + (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) { > + tp = __va(modaddr->paddr); > + ep = (struct boot_e820_entry *)tp->entries; > + > + pvh_bootparams.e820_entries = tp->nr_entries; > + > + for (idx = 0; idx < tp->nr_entries ; idx++, ep++) > + 
pvh_bootparams.e820_table[idx] = *ep; > + } > } > - pvh_bootparams.e820_entries = memmap.nr_entries; > > if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { > pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr = > @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void) > pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type = > E820_TYPE_RESERVED; > pvh_bootparams.e820_entries++; > - } else > + } else if (xen_guest) { > xen_raw_printk("Warning: Can fit ISA range into e820\n"); > + } > > pvh_bootparams.hdr.cmd_line_ptr = > pvh_start_info.cmdline_paddr; > @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void) > * environment (i.e. hardware_subarch 0). > */ > pvh_bootparams.hdr.version = 0x212; > - pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ > + pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0; > } > > /* > @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void) > */ > void __init xen_prepare_pvh(void) > { > - u32 msr; > + > + u32 msr = xen_cpuid_base(); > u64 pfn; > + bool xen_guest = msr ? true : false; > > if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { > - xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", > - pvh_start_info.magic); > + if (xen_guest) > + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", > + pvh_start_info.magic); > BUG(); > } > > - xen_pvh = 1; > + if (xen_guest) { > + xen_pvh = 1; > + > + msr = cpuid_ebx(msr + 2); > + pfn = __pa(hypercall_page); > + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); > + > + } else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) { > + BUG(); > + } > > - msr = cpuid_ebx(xen_cpuid_base() + 2); > - pfn = __pa(hypercall_page); > - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); > + init_pvh_bootparams(xen_guest); > > - init_pvh_bootparams(); > + if (xen_guest) > + x86_init.oem.arch_setup = xen_pvh_arch_setup; > > - x86_init.oem.arch_setup = xen_pvh_arch_setup; > } >
On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote: > On 28/11/17 20:34, Maran Wilson wrote: > > For certain applications it is desirable to rapidly boot a KVM virtual > > machine. In cases where legacy hardware and software support within the > > guest is not needed, Qemu should be able to boot directly into the > > uncompressed Linux kernel binary without the need to run firmware. > > > > There already exists an ABI to allow this for Xen PVH guests and the ABI is > > supported by Linux and FreeBSD: > > > > https://xenbits.xen.org/docs/unstable/misc/hvmlite.html I would also add a link to: http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info > > This PoC patch enables Qemu to use that same entry point for booting KVM > > guests. > > > > Even though the code is still PoC quality, I'm sending this as an RFC now > > since there are a number of different ways the specific implementation > > details can be handled. I chose a shared code path for Xen and KVM guests > > but could just as easily create a separate code path that is advertised by > > a different ELF note for KVM. There also seems to be some flexibility in > > how the e820 table data is passed and how (or if) it should be identified > > as e820 data. As a starting point, I've chosen the options that seem to > > result in the smallest patch with minimal to no changes required of the > > x86/HVM direct boot ABI. > > I like the idea. > > I'd rather split up the different hypervisor types early and use a > common set of service functions instead of special casing xen_guest > everywhere. This would make it much easier to support the KVM PVH > boot without the need to configure the kernel with CONFIG_XEN. > > Another option would be to use the same boot path as with grub: set > the boot params in zeropage and start at startup_32. 
I think I prefer this approach since AFAICT it should allow for greater code share with the common boot path. > > Juergen > > > --- > > arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------ > > 1 file changed, 55 insertions(+), 19 deletions(-) > > > > diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c > > index 98ab176..d93f711 100644 > > --- a/arch/x86/xen/enlighten_pvh.c > > +++ b/arch/x86/xen/enlighten_pvh.c > > @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void) > > acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; > > } > > > > -static void __init init_pvh_bootparams(void) > > +static void __init init_pvh_bootparams(bool xen_guest) > > { > > struct xen_memory_map memmap; > > int rc; > > > > memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); > > > > - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); > > - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); > > - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); > > - if (rc) { > > - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); > > - BUG(); > > + if (xen_guest) { > > + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); > > + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); > > + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); > > + if (rc) { > > + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); > > + BUG(); > > + } > > + pvh_bootparams.e820_entries = memmap.nr_entries; > > + } else if (pvh_start_info.nr_modules > 1) { > > + /* The second module should be the e820 data for KVM guests */ I don't think this is desirable. You might want to boot other OSes using this method, and they might want to pass more than one module. IMHO the hvm_start_info structure should be bumped to contain a pointer to the memory map. Note that there's a 'version' field that can be used for that. Even on Xen we might want to pass the memory map in such a way instead of using the hypercall. Thanks, Roger.
On 28/11/2017 20:34, Maran Wilson wrote: > For certain applications it is desirable to rapidly boot a KVM virtual > machine. In cases where legacy hardware and software support within the > guest is not needed, Qemu should be able to boot directly into the > uncompressed Linux kernel binary without the need to run firmware. > > There already exists an ABI to allow this for Xen PVH guests and the ABI is > supported by Linux and FreeBSD: > > https://xenbits.xen.org/docs/unstable/misc/hvmlite.html > > This PoC patch enables Qemu to use that same entry point for booting KVM > guests. Nice! So QEMU would parse the ELF file just like for multiboot, find the ELF note, and then prepare an hvmlite boot info struct instead of the multiboot one? There would then be a new option ROM, very similar to multiboot.S. Thanks, Paolo
On 11/29/2017 03:50 AM, Roger Pau Monné wrote: > On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote: >> On 28/11/17 20:34, Maran Wilson wrote: >>> For certain applications it is desirable to rapidly boot a KVM virtual >>> machine. In cases where legacy hardware and software support within the >>> guest is not needed, Qemu should be able to boot directly into the >>> uncompressed Linux kernel binary without the need to run firmware. >>> >>> There already exists an ABI to allow this for Xen PVH guests and the ABI is >>> supported by Linux and FreeBSD: >>> >>> https://xenbits.xen.org/docs/unstable/misc/hvmlite.html > I would also add a link to: > > http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info > >>> This PoC patch enables Qemu to use that same entry point for booting KVM >>> guests. >>> >>> Even though the code is still PoC quality, I'm sending this as an RFC now >>> since there are a number of different ways the specific implementation >>> details can be handled. I chose a shared code path for Xen and KVM guests >>> but could just as easily create a separate code path that is advertised by >>> a different ELF note for KVM. There also seems to be some flexibility in >>> how the e820 table data is passed and how (or if) it should be identified >>> as e820 data. As a starting point, I've chosen the options that seem to >>> result in the smallest patch with minimal to no changes required of the >>> x86/HVM direct boot ABI. >> I like the idea. >> >> I'd rather split up the different hypervisor types early and use a >> common set of service functions instead of special casing xen_guest >> everywhere. This would make it much easier to support the KVM PVH >> boot without the need to configure the kernel with CONFIG_XEN. >> >> Another option would be to use the same boot path as with grub: set >> the boot params in zeropage and start at startup_32. 
> I think I prefer this approach since AFAICT it should allow for > greater code share with the common boot path. zeropage is x86/Linux-specific so we'd need some sort of firmware (like grub) between a hypervisor and Linux to convert hvm_start_info to bootparams. -boris
On 29/11/17 15:03, Boris Ostrovsky wrote: > On 11/29/2017 03:50 AM, Roger Pau Monné wrote: >> On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote: >>> On 28/11/17 20:34, Maran Wilson wrote: >>>> For certain applications it is desirable to rapidly boot a KVM virtual >>>> machine. In cases where legacy hardware and software support within the >>>> guest is not needed, Qemu should be able to boot directly into the >>>> uncompressed Linux kernel binary without the need to run firmware. >>>> >>>> There already exists an ABI to allow this for Xen PVH guests and the ABI is >>>> supported by Linux and FreeBSD: >>>> >>>> https://xenbits.xen.org/docs/unstable/misc/hvmlite.html >> I would also add a link to: >> >> http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info >> >>>> This PoC patch enables Qemu to use that same entry point for booting KVM >>>> guests. >>>> >>>> Even though the code is still PoC quality, I'm sending this as an RFC now >>>> since there are a number of different ways the specific implementation >>>> details can be handled. I chose a shared code path for Xen and KVM guests >>>> but could just as easily create a separate code path that is advertised by >>>> a different ELF note for KVM. There also seems to be some flexibility in >>>> how the e820 table data is passed and how (or if) it should be identified >>>> as e820 data. As a starting point, I've chosen the options that seem to >>>> result in the smallest patch with minimal to no changes required of the >>>> x86/HVM direct boot ABI. >>> I like the idea. >>> >>> I'd rather split up the different hypervisor types early and use a >>> common set of service functions instead of special casing xen_guest >>> everywhere. This would make it much easier to support the KVM PVH >>> boot without the need to configure the kernel with CONFIG_XEN. 
>>> >>> Another option would be to use the same boot path as with grub: set >>> the boot params in zeropage and start at startup_32. >> I think I prefer this approach since AFAICT it should allow for >> greater code share with the common boot path. > > zeropage is x86/Linux-specific so we'd need some sort of firmware (like > grub) between a hypervisor and Linux to convert hvm_start_info to > bootparams. qemu? Juergen
On Wed, Nov 29, 2017 at 03:11:12PM +0100, Juergen Gross wrote: > On 29/11/17 15:03, Boris Ostrovsky wrote: > > On 11/29/2017 03:50 AM, Roger Pau Monné wrote: > >> On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote: > >>> On 28/11/17 20:34, Maran Wilson wrote: > >>>> For certain applications it is desirable to rapidly boot a KVM virtual > >>>> machine. In cases where legacy hardware and software support within the > >>>> guest is not needed, Qemu should be able to boot directly into the > >>>> uncompressed Linux kernel binary without the need to run firmware. > >>>> > >>>> There already exists an ABI to allow this for Xen PVH guests and the ABI is > >>>> supported by Linux and FreeBSD: > >>>> > >>>> https://xenbits.xen.org/docs/unstable/misc/hvmlite.html > >> I would also add a link to: > >> > >> http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info > >> > >>>> This PoC patch enables Qemu to use that same entry point for booting KVM > >>>> guests. > >>>> > >>>> Even though the code is still PoC quality, I'm sending this as an RFC now > >>>> since there are a number of different ways the specific implementation > >>>> details can be handled. I chose a shared code path for Xen and KVM guests > >>>> but could just as easily create a separate code path that is advertised by > >>>> a different ELF note for KVM. There also seems to be some flexibility in > >>>> how the e820 table data is passed and how (or if) it should be identified > >>>> as e820 data. As a starting point, I've chosen the options that seem to > >>>> result in the smallest patch with minimal to no changes required of the > >>>> x86/HVM direct boot ABI. > >>> I like the idea. > >>> > >>> I'd rather split up the different hypervisor types early and use a > >>> common set of service functions instead of special casing xen_guest > >>> everywhere. 
This would make it much easier to support the KVM PVH > >>> boot without the need to configure the kernel with CONFIG_XEN. > >>> > >>> Another option would be to use the same boot path as with grub: set > >>> the boot params in zeropage and start at startup_32. > >> I think I prefer this approach since AFAICT it should allow for > >> greater code share with the common boot path. > > > > zeropage is x86/Linux-specific so we'd need some sort of firmware (like > > grub) between a hypervisor and Linux to convert hvm_start_info to > > bootparams. > > qemu? But then it won't be using the PVH entry point, and would just use the native one? My understanding was that the PVH shim inside of Linux will prepare a zero-page when booted using the PVH entry point, and then jump into the native boot path. Roger.
On 11/29/2017 09:18 AM, Roger Pau Monné wrote: > On Wed, Nov 29, 2017 at 03:11:12PM +0100, Juergen Gross wrote: >> On 29/11/17 15:03, Boris Ostrovsky wrote: >>> On 11/29/2017 03:50 AM, Roger Pau Monné wrote: >>>> On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote: >>>>> On 28/11/17 20:34, Maran Wilson wrote: >>>>>> For certain applications it is desirable to rapidly boot a KVM virtual >>>>>> machine. In cases where legacy hardware and software support within the >>>>>> guest is not needed, Qemu should be able to boot directly into the >>>>>> uncompressed Linux kernel binary without the need to run firmware. >>>>>> >>>>>> There already exists an ABI to allow this for Xen PVH guests and the ABI is >>>>>> supported by Linux and FreeBSD: >>>>>> >>>>>> https://xenbits.xen.org/docs/unstable/misc/hvmlite.html >>>> I would also add a link to: >>>> >>>> http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info >>>> >>>>>> This PoC patch enables Qemu to use that same entry point for booting KVM >>>>>> guests. >>>>>> >>>>>> Even though the code is still PoC quality, I'm sending this as an RFC now >>>>>> since there are a number of different ways the specific implementation >>>>>> details can be handled. I chose a shared code path for Xen and KVM guests >>>>>> but could just as easily create a separate code path that is advertised by >>>>>> a different ELF note for KVM. There also seems to be some flexibility in >>>>>> how the e820 table data is passed and how (or if) it should be identified >>>>>> as e820 data. As a starting point, I've chosen the options that seem to >>>>>> result in the smallest patch with minimal to no changes required of the >>>>>> x86/HVM direct boot ABI. >>>>> I like the idea. >>>>> >>>>> I'd rather split up the different hypervisor types early and use a >>>>> common set of service functions instead of special casing xen_guest >>>>> everywhere. 
This would make it much easier to support the KVM PVH >>>>> boot without the need to configure the kernel with CONFIG_XEN. >>>>> >>>>> Another option would be to use the same boot path as with grub: set >>>>> the boot params in zeropage and start at startup_32. >>>> I think I prefer this approach since AFAICT it should allow for >>>> greater code share with the common boot path. >>> zeropage is x86/Linux-specific so we'd need some sort of firmware (like >>> grub) between a hypervisor and Linux to convert hvm_start_info to >>> bootparams. >> qemu? I think KVM folks didn't want to do this. I can't find the thread but I believe it was somewhere during Clear Containers discussion. Paolo? > But then it won't be using the PVH entry point, and would just use the > native one? > > My understanding was that the PVH shim inside of Linux will prepare a > zero-page when booted using the PVH entry point, and then jump into > the native boot path. Right, but that's not what Juergen's second option is. IIUIC with that option Linux starts with zeropage already prepared. No shim in the kernel. -boris
On 29/11/2017 15:25, Boris Ostrovsky wrote: >>>> zeropage is x86/Linux-specific so we'd need some sort of firmware (like >>>> grub) between a hypervisor and Linux to convert hvm_start_info to >>>> bootparams. >>> qemu? > > I think KVM folks didn't want to do this. I can't find the thread but I > believe it was somewhere during Clear Containers discussion. Paolo? QEMU is the right place to parse the ELF file and save it in memory. You would have to teach QEMU to find the Xen note in ELF-format kernels (just like it looks for the multiboot header), and use a different option ROM ("pvhboot.c" for example). However I don't like to bypass the BIOS; for -kernel, KVM starts the guest with an option ROM (linuxboot-dma.c or multiboot.S in QEMU sources) that takes care of boot. In either case, you would have a new option ROM. It could either be very simple and similar to multiboot.S, or it could be larger and do the same task as xen-pvh.S and enlighten_pvh.c (then get the address of startup_32 or startup_64 from FW_CFG_KERNEL_ENTRY and jump there). The ugly part is that the option ROM would have to know more details about what it is going to boot, including for example whether it's 32-bit or 64-bit, so I don't really think it is a good idea. I actually like this patch, except that I'd get the e820 memory map from fw_cfg (see the first part of https://github.com/bonzini/qboot/blob/master/fw_cfg.c, and extract_e820 in https://github.com/bonzini/qboot/blob/master/main.c) instead of the second module. Thanks, Paolo > >> But then it won't be using the PVH entry point, and would just use the >> native one? >> >> My understanding was that the PVH shim inside of Linux will prepare a >> zero-page when booted using the PVH entry point, and then jump into >> the native boot path. > Right, but that's not what Juergen's second option is. IIUIC with that > option Linux starts with zeropage already prepared. No shim in the kernel.
On 29/11/17 15:44, Paolo Bonzini wrote: > On 29/11/2017 15:25, Boris Ostrovsky wrote: >>>>> zeropage is x86/Linux-specific so we'd need some sort of firmware (like >>>>> grub) between a hypervisor and Linux to convert hvm_start_info to >>>>> bootparams. >>>> qemu? >> >> I think KVM folks didn't want to do this. I can't find the thread but I >> believe it was somewhere during Clear Containers discussion. Paolo? > > QEMU is the right place to parse the ELF file and save it in memory. > You would have to teach QEMU to find the Xen note in ELF-format kernels > (just like it looks for the multiboot header), and use a different > option ROM ("pvhboot.c" for example). > > However I don't like to bypass the BIOS; for -kernel, KVM starts the > guest with an option ROM (linuxboot-dma.c or multiboot.S in QEMU > sources) that takes care of boot. > > In either case, you would have a new option ROM. It could either be > very simple and similar to multiboot.S, or it could be larger and do the > same task as xen-pvh.S and enlighten_pvh.c (then get the address of > startup_32 or startup_64 from FW_CFG_KERNEL_ENTRY and jump there). The > ugly part is that the option ROM would have to know more details about > what it is going to boot, including for example whether it's 32-bit or > 64-bit, so I don't really think it is a good idea. As grub2 doesn't have to know, qemu shouldn't have to know either. Juergen
On 29/11/2017 15:47, Juergen Gross wrote: > On 29/11/17 15:44, Paolo Bonzini wrote: >> In either case, you would have a new option ROM. It could either be >> very simple and similar to multiboot.S, or it could be larger and do the >> same task as xen-pvh.S and enlighten_pvh.c (then get the address of >> startup_32 or startup_64 from FW_CFG_KERNEL_ENTRY and jump there). The >> ugly part is that the option ROM would have to know more details about >> what it is going to boot, including for example whether it's 32-bit or >> 64-bit, so I don't really think it is a good idea. > > As grub2 doesn't have to know, qemu shouldn't have to know either. That would be exactly what linuxboot-dma.c does already, but it's slower than PVH because Linux has to uncompress itself. The above thought experiment would make QEMU able to boot a PVH kernel without any changes to Linux. Paolo
On 29/11/17 14:47, Juergen Gross wrote: > On 29/11/17 15:44, Paolo Bonzini wrote: >> On 29/11/2017 15:25, Boris Ostrovsky wrote: >>>>>> zeropage is x86/Linux-specific so we'd need some sort of firmware (like >>>>>> grub) between a hypervisor and Linux to convert hvm_start_info to >>>>>> bootparams. >>>>> qemu? >>> I think KVM folks didn't want to do this. I can't find the thread but I >>> believe it was somewhere during Clear Containers discussion. Paolo? >> QEMU is the right place to parse the ELF file and save it in memory. >> You would have to teach QEMU to find the Xen note in ELF-format kernels >> (just like it looks for the multiboot header), and use a different >> option ROM ("pvhboot.c" for example). >> >> However I don't like to bypass the BIOS; for -kernel, KVM starts the >> guest with an option ROM (linuxboot-dma.c or multiboot.S in QEMU >> sources) that takes care of boot. >> >> In either case, you would have a new option ROM. It could either be >> very simple and similar to multiboot.S, or it could be larger and do the >> same task as xen-pvh.S and enlighten_pvh.c (then get the address of >> startup_32 or startup_64 from FW_CFG_KERNEL_ENTRY and jump there). The >> ugly part is that the option ROM would have to know more details about >> what it is going to boot, including for example whether it's 32-bit or >> 64-bit, so I don't really think it is a good idea. > As grub2 doesn't have to know, qemu shouldn't have to know either. An underlying requirement for this boot protocol was to remove the requirement for a priori knowledge of the eventual mode of the guest, which plagues Xen PV guests. (One way or another, we need to parse the kernel which will end up running to work out how to build the domain for it.) 32bit flat mode is easy to set up, sufficiently large for any reasonable bootstrapping, and provides no restrictions to what the eventual guest wants to do. ~Andrew
On 11/29/2017 12:59 AM, Paolo Bonzini wrote: > On 28/11/2017 20:34, Maran Wilson wrote: >> For certain applications it is desirable to rapidly boot a KVM virtual >> machine. In cases where legacy hardware and software support within the >> guest is not needed, Qemu should be able to boot directly into the >> uncompressed Linux kernel binary without the need to run firmware. >> >> There already exists an ABI to allow this for Xen PVH guests and the ABI is >> supported by Linux and FreeBSD: >> >> https://xenbits.xen.org/docs/unstable/misc/hvmlite.html >> >> This PoC patch enables Qemu to use that same entry point for booting KVM >> guests. > Nice! So QEMU would parse the ELF file just like for multiboot, find > the ELF note, and then prepare an hvmlite boot info struct instead of > the multiboot one? Yes, exactly. > There would then be a new option ROM, very similar > to multiboot.S. That is one option. I guess this gets into a discussion about the QEMU side of the upcoming patches that would follow ... I'm currently just initializing the CPU state in QEMU for testing since there is such minimal (non Linux specific) setup that is required by the ABI. And (borrowing from the Intel clear container patches) that VM setup is only performed when user selects the "nofw" option with the q35 model. But yeah, if folks think it important to move all such machine state initialization out of QEMU and into an option ROM, I can look into coding it up that way for the QEMU patches. Thanks, -Maran > Thanks, > > Paolo
On 29/11/2017 18:14, Maran Wilson wrote: > That is one option. I guess this gets into a discussion about the QEMU > side of the upcoming patches that would follow ... > > I'm currently just initializing the CPU state in QEMU for testing since > there is such minimal (non Linux specific) setup that is required by the > ABI. And (borrowing from the Intel clear container patches) that VM > setup is only performed when user selects the "nofw" option with the q35 > model. But yeah, if folks think it important to move all such machine > state initialization out of QEMU and into an option ROM, I can look into > coding it up that way for the QEMU patches. Yes, please do an option ROM. I'll take care of porting it to qboot. Thanks, Paolo
On 11/29/2017 12:21 AM, Juergen Gross wrote: > On 28/11/17 20:34, Maran Wilson wrote: >> For certain applications it is desirable to rapidly boot a KVM virtual >> machine. In cases where legacy hardware and software support within the >> guest is not needed, Qemu should be able to boot directly into the >> uncompressed Linux kernel binary without the need to run firmware. >> >> There already exists an ABI to allow this for Xen PVH guests and the ABI is >> supported by Linux and FreeBSD: >> >> https://xenbits.xen.org/docs/unstable/misc/hvmlite.html >> >> This PoC patch enables Qemu to use that same entry point for booting KVM >> guests. >> >> Even though the code is still PoC quality, I'm sending this as an RFC now >> since there are a number of different ways the specific implementation >> details can be handled. I chose a shared code path for Xen and KVM guests >> but could just as easily create a separate code path that is advertised by >> a different ELF note for KVM. There also seems to be some flexibility in >> how the e820 table data is passed and how (or if) it should be identified >> as e820 data. As a starting point, I've chosen the options that seem to >> result in the smallest patch with minimal to no changes required of the >> x86/HVM direct boot ABI. > I like the idea. > > I'd rather split up the different hypervisor types early and use a > common set of service functions instead of special casing xen_guest > everywhere. This would make it much easier to support the KVM PVH > boot without the need to configure the kernel with CONFIG_XEN. Thanks for the feedback. I'll try doing something like that as this patch moves from proof of concept to a real proposal. > Another option would be to use the same boot path as with grub: set > the boot params in zeropage and start at startup_32. I think others have already responded about that. The main thing I was trying to avoid was adding any Linux OS specific initialization (like zeropage) to QEMU. 
Especially since this PVH entry point already exists in Linux. Thanks, -Maran > > Juergen > >> --- >> arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------ >> 1 file changed, 55 insertions(+), 19 deletions(-) >> >> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c >> index 98ab176..d93f711 100644 >> --- a/arch/x86/xen/enlighten_pvh.c >> +++ b/arch/x86/xen/enlighten_pvh.c >> @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void) >> acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; >> } >> >> -static void __init init_pvh_bootparams(void) >> +static void __init init_pvh_bootparams(bool xen_guest) >> { >> struct xen_memory_map memmap; >> int rc; >> >> memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); >> >> - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); >> - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); >> - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); >> - if (rc) { >> - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); >> - BUG(); >> + if (xen_guest) { >> + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); >> + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); >> + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); >> + if (rc) { >> + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); >> + BUG(); >> + } >> + pvh_bootparams.e820_entries = memmap.nr_entries; >> + } else if (pvh_start_info.nr_modules > 1) { >> + /* The second module should be the e820 data for KVM guests */ >> + struct hvm_modlist_entry *modaddr; >> + char e820_sig[] = "e820 data"; >> + struct boot_e820_entry *ep; >> + struct e820_table *tp; >> + char *cmdline_str; >> + int idx; >> + >> + modaddr = __va(pvh_start_info.modlist_paddr + >> + sizeof(struct hvm_modlist_entry)); >> + cmdline_str = __va(modaddr->cmdline_paddr); >> + >> + if ((modaddr->cmdline_paddr) && >> + (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) { >> + tp = __va(modaddr->paddr); >> + ep = (struct 
boot_e820_entry *)tp->entries; >> + >> + pvh_bootparams.e820_entries = tp->nr_entries; >> + >> + for (idx = 0; idx < tp->nr_entries ; idx++, ep++) >> + pvh_bootparams.e820_table[idx] = *ep; >> + } >> } >> - pvh_bootparams.e820_entries = memmap.nr_entries; >> >> if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { >> pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr = >> @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void) >> pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type = >> E820_TYPE_RESERVED; >> pvh_bootparams.e820_entries++; >> - } else >> + } else if (xen_guest) { >> xen_raw_printk("Warning: Can fit ISA range into e820\n"); >> + } >> >> pvh_bootparams.hdr.cmd_line_ptr = >> pvh_start_info.cmdline_paddr; >> @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void) >> * environment (i.e. hardware_subarch 0). >> */ >> pvh_bootparams.hdr.version = 0x212; >> - pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ >> + pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0; >> } >> >> /* >> @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void) >> */ >> void __init xen_prepare_pvh(void) >> { >> - u32 msr; >> + >> + u32 msr = xen_cpuid_base(); >> u64 pfn; >> + bool xen_guest = msr ? 
true : false; >> >> if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { >> - xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", >> - pvh_start_info.magic); >> + if (xen_guest) >> + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", >> + pvh_start_info.magic); >> BUG(); >> } >> >> - xen_pvh = 1; >> + if (xen_guest) { >> + xen_pvh = 1; >> + >> + msr = cpuid_ebx(msr + 2); >> + pfn = __pa(hypercall_page); >> + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); >> + >> + } else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) { >> + BUG(); >> + } >> >> - msr = cpuid_ebx(xen_cpuid_base() + 2); >> - pfn = __pa(hypercall_page); >> - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); >> + init_pvh_bootparams(xen_guest); >> >> - init_pvh_bootparams(); >> + if (xen_guest) >> + x86_init.oem.arch_setup = xen_pvh_arch_setup; >> >> - x86_init.oem.arch_setup = xen_pvh_arch_setup; >> } >>
On 11/29/2017 6:44 AM, Paolo Bonzini wrote: > I actually like this patch, except that I'd get the e820 memory map from > fw_cfg (see the first part of > https://github.com/bonzini/qboot/blob/master/fw_cfg.c, and extract_e820 > in https://github.com/bonzini/qboot/blob/master/main.c) instead of the > second module. Hi Paolo, I want to make sure I understand exactly what you are suggesting... Are you saying the Linux PVH entry code (such as init_pvh_bootparams()) should use the fw_cfg interface to read the e820 memory map data and put it into the zeropage? Basically, keeping the patch very much like it already is, just extracting the e820 data via the fw_cfg interface instead of from the second module of start_info struct? If that is the case, I guess I'm a bit hesitant to throw the QEMU specific fw_cfg interface into the mix on the Linux PVH side when the existing PVH ABI already seems to contain an interface for passing modules/blobs to the guest. But if you feel there is a compelling reason to use the fw_cfg interface here, I'm happy to explore that approach further. Thanks, -Maran
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 98ab176..d93f711 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void) acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; } -static void __init init_pvh_bootparams(void) +static void __init init_pvh_bootparams(bool xen_guest) { struct xen_memory_map memmap; int rc; memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc) { - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); - BUG(); + if (xen_guest) { + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc) { + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); + BUG(); + } + pvh_bootparams.e820_entries = memmap.nr_entries; + } else if (pvh_start_info.nr_modules > 1) { + /* The second module should be the e820 data for KVM guests */ + struct hvm_modlist_entry *modaddr; + char e820_sig[] = "e820 data"; + struct boot_e820_entry *ep; + struct e820_table *tp; + char *cmdline_str; + int idx; + + modaddr = __va(pvh_start_info.modlist_paddr + + sizeof(struct hvm_modlist_entry)); + cmdline_str = __va(modaddr->cmdline_paddr); + + if ((modaddr->cmdline_paddr) && + (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) { + tp = __va(modaddr->paddr); + ep = (struct boot_e820_entry *)tp->entries; + + pvh_bootparams.e820_entries = tp->nr_entries; + + for (idx = 0; idx < tp->nr_entries ; idx++, ep++) + pvh_bootparams.e820_table[idx] = *ep; + } } - pvh_bootparams.e820_entries = memmap.nr_entries; if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr = @@ -55,8 +80,9 @@ static void 
__init init_pvh_bootparams(void) pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type = E820_TYPE_RESERVED; pvh_bootparams.e820_entries++; - } else + } else if (xen_guest) { xen_raw_printk("Warning: Can fit ISA range into e820\n"); + } pvh_bootparams.hdr.cmd_line_ptr = pvh_start_info.cmdline_paddr; @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void) * environment (i.e. hardware_subarch 0). */ pvh_bootparams.hdr.version = 0x212; - pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ + pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0; } /* @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void) */ void __init xen_prepare_pvh(void) { - u32 msr; + + u32 msr = xen_cpuid_base(); u64 pfn; + bool xen_guest = msr ? true : false; if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { - xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", - pvh_start_info.magic); + if (xen_guest) + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", + pvh_start_info.magic); BUG(); } - xen_pvh = 1; + if (xen_guest) { + xen_pvh = 1; + + msr = cpuid_ebx(msr + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + } else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) { + BUG(); + } - msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + init_pvh_bootparams(xen_guest); - init_pvh_bootparams(); + if (xen_guest) + x86_init.oem.arch_setup = xen_pvh_arch_setup; - x86_init.oem.arch_setup = xen_pvh_arch_setup; }