
[RFC] KVM: x86: Allow Qemu/KVM to use PVH entry point

Message ID 1511897682-32060-1-git-send-email-maran.wilson@oracle.com (mailing list archive)
State New, archived

Commit Message

Maran Wilson Nov. 28, 2017, 7:34 p.m. UTC
For certain applications it is desirable to rapidly boot a KVM virtual
machine. In cases where legacy hardware and software support within the
guest is not needed, Qemu should be able to boot directly into the
uncompressed Linux kernel binary without the need to run firmware.

There already exists an ABI to allow this for Xen PVH guests and the ABI is
supported by Linux and FreeBSD:

   https://xenbits.xen.org/docs/unstable/misc/hvmlite.html

This PoC patch enables Qemu to use that same entry point for booting KVM
guests.

Even though the code is still PoC quality, I'm sending this as an RFC now
since there are a number of different ways the specific implementation
details can be handled. I chose a shared code path for Xen and KVM guests
but could just as easily create a separate code path that is advertised by
a different ELF note for KVM. There also seems to be some flexibility in
how the e820 table data is passed and how (or if) it should be identified
as e820 data. As a starting point, I've chosen the options that seem to
result in the smallest patch with minimal to no changes required of the
x86/HVM direct boot ABI.
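
For reference, this is roughly the layout the guest-side code below expects the
loader to have set up in guest memory before jumping to the PVH entry point.
The struct definitions and XEN_HVM_START_MAGIC_VALUE come from the Xen public
start_info.h header; the helper and its placeholder variables below are made
up, and the use of the first module slot for an initrd is only an example:

    /* Illustrative sketch only: describe the e820 data as the second boot
     * module, in the form init_pvh_bootparams() below looks for. */
    static void build_pvh_start_info(struct hvm_start_info *si,
                                     struct hvm_modlist_entry mods[2])
    {
        /* All *_paddr/*_size values are guest-physical addresses/sizes the
         * loader has already populated; declared here only as placeholders. */
        uint64_t initrd_paddr = 0, initrd_size = 0;       /* first module, e.g. an initrd */
        uint64_t e820_blob_paddr = 0, e820_blob_size = 0; /* struct e820_table blob       */
        uint64_t e820_sig_paddr = 0;                      /* the "e820 data\0" string     */
        uint64_t mods_paddr = 0, cmdline_paddr = 0, rsdp_paddr = 0;

        mods[0].paddr = initrd_paddr;
        mods[0].size  = initrd_size;
        mods[1].paddr = e820_blob_paddr;
        mods[1].size  = e820_blob_size;
        mods[1].cmdline_paddr = e820_sig_paddr;      /* identifies the blob as e820 data */

        si->magic         = XEN_HVM_START_MAGIC_VALUE;   /* 0x336ec578 */
        si->version       = 0;
        si->nr_modules    = 2;
        si->modlist_paddr = mods_paddr;              /* where mods[] lives in the guest  */
        si->cmdline_paddr = cmdline_paddr;
        si->rsdp_paddr    = rsdp_paddr;
        /* %ebx must hold the guest-physical address of *si at the PVH entry point. */
    }
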
---
 arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------
 1 file changed, 55 insertions(+), 19 deletions(-)

Comments

Andrew Cooper Nov. 28, 2017, 7:41 p.m. UTC | #1
On 28/11/17 19:34, Maran Wilson wrote:
> For certain applications it is desirable to rapidly boot a KVM virtual
> machine. In cases where legacy hardware and software support within the
> guest is not needed, Qemu should be able to boot directly into the
> uncompressed Linux kernel binary without the need to run firmware.
>
> There already exists an ABI to allow this for Xen PVH guests and the ABI is
> supported by Linux and FreeBSD:
>
>    https://xenbits.xen.org/docs/unstable/misc/hvmlite.html

Just FYI, this link has recently become stale, following some cleanup. 
The document is now:

https://xenbits.xen.org/docs/unstable/misc/pvh.html

~Andrew

>
> This PoC patch enables Qemu to use that same entry point for booting KVM
> guests.
>
> Even though the code is still PoC quality, I'm sending this as an RFC now
> since there are a number of different ways the specific implementation
> details can be handled. I chose a shared code path for Xen and KVM guests
> but could just as easily create a separate code path that is advertised by
> a different ELF note for KVM. There also seems to be some flexibility in
> how the e820 table data is passed and how (or if) it should be identified
> as e820 data. As a starting point, I've chosen the options that seem to
> result in the smallest patch with minimal to no changes required of the
> x86/HVM direct boot ABI.
> ---
>  arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------
>  1 file changed, 55 insertions(+), 19 deletions(-)
>
> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
> index 98ab176..d93f711 100644
> --- a/arch/x86/xen/enlighten_pvh.c
> +++ b/arch/x86/xen/enlighten_pvh.c
> @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void)
>  		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
>  }
>  
> -static void __init init_pvh_bootparams(void)
> +static void __init init_pvh_bootparams(bool xen_guest)
>  {
>  	struct xen_memory_map memmap;
>  	int rc;
>  
>  	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
>  
> -	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> -	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> -	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> -	if (rc) {
> -		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> -		BUG();
> +	if (xen_guest) {
> +		memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> +		set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> +		rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> +		if (rc) {
> +			xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> +			BUG();
> +		}
> +		pvh_bootparams.e820_entries = memmap.nr_entries;
> +	} else if (pvh_start_info.nr_modules > 1) {
> +		/* The second module should be the e820 data for KVM guests */
> +		struct hvm_modlist_entry *modaddr;
> +		char e820_sig[] = "e820 data";
> +		struct boot_e820_entry *ep;
> +		struct e820_table *tp;
> +		char *cmdline_str;
> +		int idx;
> +
> +		modaddr = __va(pvh_start_info.modlist_paddr +
> +			       sizeof(struct hvm_modlist_entry));
> +		cmdline_str = __va(modaddr->cmdline_paddr);
> +
> +		if ((modaddr->cmdline_paddr) &&
> +		    (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) {
> +			tp = __va(modaddr->paddr);
> +			ep = (struct boot_e820_entry *)tp->entries;
> +
> +			pvh_bootparams.e820_entries = tp->nr_entries;
> +
> +			for (idx = 0; idx < tp->nr_entries ; idx++, ep++)
> +				pvh_bootparams.e820_table[idx] = *ep;
> +		}
>  	}
> -	pvh_bootparams.e820_entries = memmap.nr_entries;
>  
>  	if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
>  		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr =
> @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void)
>  		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type =
>  			E820_TYPE_RESERVED;
>  		pvh_bootparams.e820_entries++;
> -	} else
> +	} else if (xen_guest) {
>  		xen_raw_printk("Warning: Can fit ISA range into e820\n");
> +	}
>  
>  	pvh_bootparams.hdr.cmd_line_ptr =
>  		pvh_start_info.cmdline_paddr;
> @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void)
>  	 * environment (i.e. hardware_subarch 0).
>  	 */
>  	pvh_bootparams.hdr.version = 0x212;
> -	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
> +	pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0;
>  }
>  
>  /*
> @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void)
>   */
>  void __init xen_prepare_pvh(void)
>  {
> -	u32 msr;
> +
> +	u32 msr = xen_cpuid_base();
>  	u64 pfn;
> +	bool xen_guest = msr ? true : false;
>  
>  	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
> -		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
> -				pvh_start_info.magic);
> +		if (xen_guest)
> +			xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
> +					pvh_start_info.magic);
>  		BUG();
>  	}
>  
> -	xen_pvh = 1;
> +	if (xen_guest) {
> +		xen_pvh = 1;
> +
> +		msr = cpuid_ebx(msr + 2);
> +		pfn = __pa(hypercall_page);
> +		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +
> +	} else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) {
> +		BUG();
> +	}
>  
> -	msr = cpuid_ebx(xen_cpuid_base() + 2);
> -	pfn = __pa(hypercall_page);
> -	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +	init_pvh_bootparams(xen_guest);
>  
> -	init_pvh_bootparams();
> +	if (xen_guest)
> +		x86_init.oem.arch_setup = xen_pvh_arch_setup;
>  
> -	x86_init.oem.arch_setup = xen_pvh_arch_setup;
>  }
Christoph Hellwig Nov. 28, 2017, 7:58 p.m. UTC | #2
On Tue, Nov 28, 2017 at 11:34:42AM -0800, Maran Wilson wrote:
> This PoC patch enables Qemu to use that same entry point for booting KVM
> guests.

Nice.  I do a lot of -kernel boots in qemu/kvm for testing, and
speeding this up further would be great.
Juergen Gross Nov. 29, 2017, 8:21 a.m. UTC | #3
On 28/11/17 20:34, Maran Wilson wrote:
> For certain applications it is desirable to rapidly boot a KVM virtual
> machine. In cases where legacy hardware and software support within the
> guest is not needed, Qemu should be able to boot directly into the
> uncompressed Linux kernel binary without the need to run firmware.
> 
> There already exists an ABI to allow this for Xen PVH guests and the ABI is
> supported by Linux and FreeBSD:
> 
>    https://xenbits.xen.org/docs/unstable/misc/hvmlite.html
> 
> This PoC patch enables Qemu to use that same entry point for booting KVM
> guests.
> 
> Even though the code is still PoC quality, I'm sending this as an RFC now
> since there are a number of different ways the specific implementation
> details can be handled. I chose a shared code path for Xen and KVM guests
> but could just as easily create a separate code path that is advertised by
> a different ELF note for KVM. There also seems to be some flexibility in
> how the e820 table data is passed and how (or if) it should be identified
> as e820 data. As a starting point, I've chosen the options that seem to
> result in the smallest patch with minimal to no changes required of the
> x86/HVM direct boot ABI.

I like the idea.

I'd rather split up the different hypervisor types early and use a
common set of service functions instead of special casing xen_guest
everywhere. This would make it much easier to support the KVM PVH
boot without the need to configure the kernel with CONFIG_XEN.
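
Something along these lines, maybe (all function names below are invented,
purely to sketch the split):

    /* Invented names -- per-hypervisor init functions sharing common
     * helpers instead of passing xen_guest flags around. */
    static void __init xen_pvh_init(struct boot_params *bp)
    {
        xen_pvh = 1;
        xen_pvh_set_hypercall_page();           /* wrmsr of hypercall page pfn */
        xen_pvh_fill_e820(bp);                  /* XENMEM_memory_map hypercall */
    }

    static void __init kvm_pvh_init(struct boot_params *bp)
    {
        kvm_pvh_fill_e820(bp);                  /* e820 from start_info module */
    }

    void __init pvh_prepare_boot_params(void)
    {
        pvh_common_init(&pvh_bootparams);       /* memmap-independent setup    */

        if (xen_cpuid_base())
            xen_pvh_init(&pvh_bootparams);
        else if (hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0))
            kvm_pvh_init(&pvh_bootparams);
        else
            BUG();
    }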

Another option would be to use the same boot path as with grub: set
the boot params in zeropage and start at startup_32.


Juergen

> ---
>  arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------
>  1 file changed, 55 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
> index 98ab176..d93f711 100644
> --- a/arch/x86/xen/enlighten_pvh.c
> +++ b/arch/x86/xen/enlighten_pvh.c
> @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void)
>  		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
>  }
>  
> -static void __init init_pvh_bootparams(void)
> +static void __init init_pvh_bootparams(bool xen_guest)
>  {
>  	struct xen_memory_map memmap;
>  	int rc;
>  
>  	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
>  
> -	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> -	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> -	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> -	if (rc) {
> -		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> -		BUG();
> +	if (xen_guest) {
> +		memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> +		set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> +		rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> +		if (rc) {
> +			xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> +			BUG();
> +		}
> +		pvh_bootparams.e820_entries = memmap.nr_entries;
> +	} else if (pvh_start_info.nr_modules > 1) {
> +		/* The second module should be the e820 data for KVM guests */
> +		struct hvm_modlist_entry *modaddr;
> +		char e820_sig[] = "e820 data";
> +		struct boot_e820_entry *ep;
> +		struct e820_table *tp;
> +		char *cmdline_str;
> +		int idx;
> +
> +		modaddr = __va(pvh_start_info.modlist_paddr +
> +			       sizeof(struct hvm_modlist_entry));
> +		cmdline_str = __va(modaddr->cmdline_paddr);
> +
> +		if ((modaddr->cmdline_paddr) &&
> +		    (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) {
> +			tp = __va(modaddr->paddr);
> +			ep = (struct boot_e820_entry *)tp->entries;
> +
> +			pvh_bootparams.e820_entries = tp->nr_entries;
> +
> +			for (idx = 0; idx < tp->nr_entries ; idx++, ep++)
> +				pvh_bootparams.e820_table[idx] = *ep;
> +		}
>  	}
> -	pvh_bootparams.e820_entries = memmap.nr_entries;
>  
>  	if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
>  		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr =
> @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void)
>  		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type =
>  			E820_TYPE_RESERVED;
>  		pvh_bootparams.e820_entries++;
> -	} else
> +	} else if (xen_guest) {
>  		xen_raw_printk("Warning: Can fit ISA range into e820\n");
> +	}
>  
>  	pvh_bootparams.hdr.cmd_line_ptr =
>  		pvh_start_info.cmdline_paddr;
> @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void)
>  	 * environment (i.e. hardware_subarch 0).
>  	 */
>  	pvh_bootparams.hdr.version = 0x212;
> -	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
> +	pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0;
>  }
>  
>  /*
> @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void)
>   */
>  void __init xen_prepare_pvh(void)
>  {
> -	u32 msr;
> +
> +	u32 msr = xen_cpuid_base();
>  	u64 pfn;
> +	bool xen_guest = msr ? true : false;
>  
>  	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
> -		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
> -				pvh_start_info.magic);
> +		if (xen_guest)
> +			xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
> +					pvh_start_info.magic);
>  		BUG();
>  	}
>  
> -	xen_pvh = 1;
> +	if (xen_guest) {
> +		xen_pvh = 1;
> +
> +		msr = cpuid_ebx(msr + 2);
> +		pfn = __pa(hypercall_page);
> +		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +
> +	} else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) {
> +		BUG();
> +	}
>  
> -	msr = cpuid_ebx(xen_cpuid_base() + 2);
> -	pfn = __pa(hypercall_page);
> -	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +	init_pvh_bootparams(xen_guest);
>  
> -	init_pvh_bootparams();
> +	if (xen_guest)
> +		x86_init.oem.arch_setup = xen_pvh_arch_setup;
>  
> -	x86_init.oem.arch_setup = xen_pvh_arch_setup;
>  }
>
Roger Pau Monné Nov. 29, 2017, 8:50 a.m. UTC | #4
On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote:
> On 28/11/17 20:34, Maran Wilson wrote:
> > For certain applications it is desirable to rapidly boot a KVM virtual
> > machine. In cases where legacy hardware and software support within the
> > guest is not needed, Qemu should be able to boot directly into the
> > uncompressed Linux kernel binary without the need to run firmware.
> > 
> > There already exists an ABI to allow this for Xen PVH guests and the ABI is
> > supported by Linux and FreeBSD:
> > 
> >    https://xenbits.xen.org/docs/unstable/misc/hvmlite.html

I would also add a link to:

http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info
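
For quick reference, the version-0 layout from that header (trimmed to the
fields used here):

    struct hvm_start_info {
        uint32_t magic;            /* Contains the magic value 0x336ec578.     */
        uint32_t version;          /* Version of this structure (currently 0). */
        uint32_t flags;            /* SIF_xxx flags.                           */
        uint32_t nr_modules;       /* Number of modules passed to the kernel.  */
        uint64_t modlist_paddr;    /* Physical address of an array of          */
                                   /* hvm_modlist_entry.                       */
        uint64_t cmdline_paddr;    /* Physical address of the command line.    */
        uint64_t rsdp_paddr;       /* Physical address of the RSDP ACPI table. */
    };

    struct hvm_modlist_entry {
        uint64_t paddr;            /* Physical address of the module.          */
        uint64_t size;             /* Size of the module in bytes.             */
        uint64_t cmdline_paddr;    /* Physical address of the command line.    */
        uint64_t reserved;
    };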

> > This PoC patch enables Qemu to use that same entry point for booting KVM
> > guests.
> > 
> > Even though the code is still PoC quality, I'm sending this as an RFC now
> > since there are a number of different ways the specific implementation
> > details can be handled. I chose a shared code path for Xen and KVM guests
> > but could just as easily create a separate code path that is advertised by
> > a different ELF note for KVM. There also seems to be some flexibility in
> > how the e820 table data is passed and how (or if) it should be identified
> > as e820 data. As a starting point, I've chosen the options that seem to
> > result in the smallest patch with minimal to no changes required of the
> > x86/HVM direct boot ABI.
> 
> I like the idea.
> 
> I'd rather split up the different hypervisor types early and use a
> common set of service functions instead of special casing xen_guest
> everywhere. This would make it much easier to support the KVM PVH
> boot without the need to configure the kernel with CONFIG_XEN.
> 
> Another option would be to use the same boot path as with grub: set
> the boot params in zeropage and start at startup_32.

I think I prefer this approach since AFAICT it should allow for
greater code share with the common boot path.

> 
> Juergen
> 
> > ---
> >  arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------
> >  1 file changed, 55 insertions(+), 19 deletions(-)
> > 
> > diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
> > index 98ab176..d93f711 100644
> > --- a/arch/x86/xen/enlighten_pvh.c
> > +++ b/arch/x86/xen/enlighten_pvh.c
> > @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void)
> >  		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
> >  }
> >  
> > -static void __init init_pvh_bootparams(void)
> > +static void __init init_pvh_bootparams(bool xen_guest)
> >  {
> >  	struct xen_memory_map memmap;
> >  	int rc;
> >  
> >  	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
> >  
> > -	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> > -	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> > -	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> > -	if (rc) {
> > -		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> > -		BUG();
> > +	if (xen_guest) {
> > +		memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
> > +		set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
> > +		rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> > +		if (rc) {
> > +			xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> > +			BUG();
> > +		}
> > +		pvh_bootparams.e820_entries = memmap.nr_entries;
> > +	} else if (pvh_start_info.nr_modules > 1) {
> > +		/* The second module should be the e820 data for KVM guests */

I don't think this is desirable. You might want to boot other OSes
using this method, and they might want to pass more than one module.

IMHO the hvm_start_info structure should be bumped to contain a
pointer to the memory map. Note that there's a 'version' field that
can be used for that. Even on Xen we might want to pass the memory map
in such a way instead of using the hypercall.
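
For instance (the field names and placement below are only a suggestion, not
an agreed format):

    /* Purely illustrative: a possible version-1 extension appended to the
     * existing fields, guarded by bumping the 'version' field. */
    struct hvm_start_info {
        /* ... existing version-0 fields (magic, version, flags, nr_modules,
         *     modlist_paddr, cmdline_paddr, rsdp_paddr) ... */
        uint64_t memmap_paddr;     /* Physical address of the memory map entries. */
        uint32_t memmap_entries;   /* Number of entries in the memory map.        */
        uint32_t reserved;
    };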

Thanks, Roger.
Paolo Bonzini Nov. 29, 2017, 8:59 a.m. UTC | #5
On 28/11/2017 20:34, Maran Wilson wrote:
> For certain applications it is desirable to rapidly boot a KVM virtual
> machine. In cases where legacy hardware and software support within the
> guest is not needed, Qemu should be able to boot directly into the
> uncompressed Linux kernel binary without the need to run firmware.
> 
> There already exists an ABI to allow this for Xen PVH guests and the ABI is
> supported by Linux and FreeBSD:
> 
>    https://xenbits.xen.org/docs/unstable/misc/hvmlite.html
> 
> This PoC patch enables Qemu to use that same entry point for booting KVM
> guests.

Nice!  So QEMU would parse the ELF file just like for multiboot, find
the ELF note, and then prepare an hvmlite boot info struct instead of
the multiboot one?  There would then be a new option ROM, very similar
to multiboot.S.
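
Roughly, the loader-side scan could look like this (a sketch only; error
handling and the QEMU loader plumbing are left out):

    #include <elf.h>          /* Elf32_Nhdr */
    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    /* 18 in the Xen public elfnote.h; the descriptor is the physical entry point. */
    #define XEN_ELFNOTE_PHYS32_ENTRY 18

    /* Scan the contents of a PT_NOTE segment for the PVH entry note. */
    static bool find_pvh_entry(const uint8_t *notes, size_t len, uint32_t *entry)
    {
        size_t off = 0;

        while (off + sizeof(Elf32_Nhdr) <= len) {
            const Elf32_Nhdr *nh = (const void *)(notes + off);
            const char *name = (const char *)(nh + 1);
            const uint8_t *desc = (const uint8_t *)name +
                                  ((nh->n_namesz + 3) & ~3);

            if (nh->n_namesz == 4 && !memcmp(name, "Xen", 4) &&
                nh->n_type == XEN_ELFNOTE_PHYS32_ENTRY && nh->n_descsz >= 4) {
                memcpy(entry, desc, 4);    /* 32-bit physical entry address */
                return true;
            }
            off += sizeof(*nh) + ((nh->n_namesz + 3) & ~3) +
                   ((nh->n_descsz + 3) & ~3);
        }
        return false;
    }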

Thanks,

Paolo
Boris Ostrovsky Nov. 29, 2017, 2:03 p.m. UTC | #6
On 11/29/2017 03:50 AM, Roger Pau Monné wrote:
> On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote:
>> On 28/11/17 20:34, Maran Wilson wrote:
>>> For certain applications it is desirable to rapidly boot a KVM virtual
>>> machine. In cases where legacy hardware and software support within the
>>> guest is not needed, Qemu should be able to boot directly into the
>>> uncompressed Linux kernel binary without the need to run firmware.
>>>
>>> There already exists an ABI to allow this for Xen PVH guests and the ABI is
>>> supported by Linux and FreeBSD:
>>>
>>>    https://xenbits.xen.org/docs/unstable/misc/hvmlite.html
> I would also add a link to:
>
> http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info
>
>>> This PoC patch enables Qemu to use that same entry point for booting KVM
>>> guests.
>>>
>>> Even though the code is still PoC quality, I'm sending this as an RFC now
>>> since there are a number of different ways the specific implementation
>>> details can be handled. I chose a shared code path for Xen and KVM guests
>>> but could just as easily create a separate code path that is advertised by
>>> a different ELF note for KVM. There also seems to be some flexibility in
>>> how the e820 table data is passed and how (or if) it should be identified
>>> as e820 data. As a starting point, I've chosen the options that seem to
>>> result in the smallest patch with minimal to no changes required of the
>>> x86/HVM direct boot ABI.
>> I like the idea.
>>
>> I'd rather split up the different hypervisor types early and use a
>> common set of service functions instead of special casing xen_guest
>> everywhere. This would make it much easier to support the KVM PVH
>> boot without the need to configure the kernel with CONFIG_XEN.
>>
>> Another option would be to use the same boot path as with grub: set
>> the boot params in zeropage and start at startup_32.
> I think I prefer this approach since AFAICT it should allow for
> greater code share with the common boot path.

zeropage is x86/Linux-specific so we'd need some sort of firmware (like
grub) between a hypervisor and Linux to convert hvm_start_info to
bootparams.

-boris
Juergen Gross Nov. 29, 2017, 2:11 p.m. UTC | #7
On 29/11/17 15:03, Boris Ostrovsky wrote:
> On 11/29/2017 03:50 AM, Roger Pau Monné wrote:
>> On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote:
>>> On 28/11/17 20:34, Maran Wilson wrote:
>>>> For certain applications it is desirable to rapidly boot a KVM virtual
>>>> machine. In cases where legacy hardware and software support within the
>>>> guest is not needed, Qemu should be able to boot directly into the
>>>> uncompressed Linux kernel binary without the need to run firmware.
>>>>
>>>> There already exists an ABI to allow this for Xen PVH guests and the ABI is
>>>> supported by Linux and FreeBSD:
>>>>
>>>>    https://xenbits.xen.org/docs/unstable/misc/hvmlite.html
>> I would also add a link to:
>>
>> http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info
>>
>>>> This PoC patch enables Qemu to use that same entry point for booting KVM
>>>> guests.
>>>>
>>>> Even though the code is still PoC quality, I'm sending this as an RFC now
>>>> since there are a number of different ways the specific implementation
>>>> details can be handled. I chose a shared code path for Xen and KVM guests
>>>> but could just as easily create a separate code path that is advertised by
>>>> a different ELF note for KVM. There also seems to be some flexibility in
>>>> how the e820 table data is passed and how (or if) it should be identified
>>>> as e820 data. As a starting point, I've chosen the options that seem to
>>>> result in the smallest patch with minimal to no changes required of the
>>>> x86/HVM direct boot ABI.
>>> I like the idea.
>>>
>>> I'd rather split up the different hypervisor types early and use a
>>> common set of service functions instead of special casing xen_guest
>>> everywhere. This would make it much easier to support the KVM PVH
>>> boot without the need to configure the kernel with CONFIG_XEN.
>>>
>>> Another option would be to use the same boot path as with grub: set
>>> the boot params in zeropage and start at startup_32.
>> I think I prefer this approach since AFAICT it should allow for
>> greater code share with the common boot path.
> 
> zeropage is x86/Linux-specific so we'd need some sort of firmware (like
> grub) between a hypervisor and Linux to convert hvm_start_info to
> bootparams.

qemu?


Juergen
Roger Pau Monné Nov. 29, 2017, 2:18 p.m. UTC | #8
On Wed, Nov 29, 2017 at 03:11:12PM +0100, Juergen Gross wrote:
> On 29/11/17 15:03, Boris Ostrovsky wrote:
> > On 11/29/2017 03:50 AM, Roger Pau Monné wrote:
> >> On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote:
> >>> On 28/11/17 20:34, Maran Wilson wrote:
> >>>> For certain applications it is desirable to rapidly boot a KVM virtual
> >>>> machine. In cases where legacy hardware and software support within the
> >>>> guest is not needed, Qemu should be able to boot directly into the
> >>>> uncompressed Linux kernel binary without the need to run firmware.
> >>>>
> >>>> There already exists an ABI to allow this for Xen PVH guests and the ABI is
> >>>> supported by Linux and FreeBSD:
> >>>>
> >>>>    https://xenbits.xen.org/docs/unstable/misc/hvmlite.html
> >> I would also add a link to:
> >>
> >> http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info
> >>
> >>>> This PoC patch enables Qemu to use that same entry point for booting KVM
> >>>> guests.
> >>>>
> >>>> Even though the code is still PoC quality, I'm sending this as an RFC now
> >>>> since there are a number of different ways the specific implementation
> >>>> details can be handled. I chose a shared code path for Xen and KVM guests
> >>>> but could just as easily create a separate code path that is advertised by
> >>>> a different ELF note for KVM. There also seems to be some flexibility in
> >>>> how the e820 table data is passed and how (or if) it should be identified
> >>>> as e820 data. As a starting point, I've chosen the options that seem to
> >>>> result in the smallest patch with minimal to no changes required of the
> >>>> x86/HVM direct boot ABI.
> >>> I like the idea.
> >>>
> >>> I'd rather split up the different hypervisor types early and use a
> >>> common set of service functions instead of special casing xen_guest
> >>> everywhere. This would make it much easier to support the KVM PVH
> >>> boot without the need to configure the kernel with CONFIG_XEN.
> >>>
> >>> Another option would be to use the same boot path as with grub: set
> >>> the boot params in zeropage and start at startup_32.
> >> I think I prefer this approach since AFAICT it should allow for
> >> greater code share with the common boot path.
> > 
> > zeropage is x86/Linux-specific so we'd need some sort of firmware (like
> > grub) between a hypervisor and Linux to convert hvm_start_info to
> > bootparams.
> 
> qemu?

But then it won't be using the PVH entry point, and would just use the
native one?

My understanding was that the PVH shim inside of Linux will prepare a
zero-page when booted using the PVH entry point, and then jump into
the native boot path.

Roger.
Boris Ostrovsky Nov. 29, 2017, 2:25 p.m. UTC | #9
On 11/29/2017 09:18 AM, Roger Pau Monné wrote:
> On Wed, Nov 29, 2017 at 03:11:12PM +0100, Juergen Gross wrote:
>> On 29/11/17 15:03, Boris Ostrovsky wrote:
>>> On 11/29/2017 03:50 AM, Roger Pau Monné wrote:
>>>> On Wed, Nov 29, 2017 at 09:21:59AM +0100, Juergen Gross wrote:
>>>>> On 28/11/17 20:34, Maran Wilson wrote:
>>>>>> For certain applications it is desirable to rapidly boot a KVM virtual
>>>>>> machine. In cases where legacy hardware and software support within the
>>>>>> guest is not needed, Qemu should be able to boot directly into the
>>>>>> uncompressed Linux kernel binary without the need to run firmware.
>>>>>>
>>>>>> There already exists an ABI to allow this for Xen PVH guests and the ABI is
>>>>>> supported by Linux and FreeBSD:
>>>>>>
>>>>>>    https://xenbits.xen.org/docs/unstable/misc/hvmlite.html
>>>> I would also add a link to:
>>>>
>>>> http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,arch-x86,hvm,start_info.h.html#Struct_hvm_start_info
>>>>
>>>>>> This PoC patch enables Qemu to use that same entry point for booting KVM
>>>>>> guests.
>>>>>>
>>>>>> Even though the code is still PoC quality, I'm sending this as an RFC now
>>>>>> since there are a number of different ways the specific implementation
>>>>>> details can be handled. I chose a shared code path for Xen and KVM guests
>>>>>> but could just as easily create a separate code path that is advertised by
>>>>>> a different ELF note for KVM. There also seems to be some flexibility in
>>>>>> how the e820 table data is passed and how (or if) it should be identified
>>>>>> as e820 data. As a starting point, I've chosen the options that seem to
>>>>>> result in the smallest patch with minimal to no changes required of the
>>>>>> x86/HVM direct boot ABI.
>>>>> I like the idea.
>>>>>
>>>>> I'd rather split up the different hypervisor types early and use a
>>>>> common set of service functions instead of special casing xen_guest
>>>>> everywhere. This would make it much easier to support the KVM PVH
>>>>> boot without the need to configure the kernel with CONFIG_XEN.
>>>>>
>>>>> Another option would be to use the same boot path as with grub: set
>>>>> the boot params in zeropage and start at startup_32.
>>>> I think I prefer this approach since AFAICT it should allow for
>>>> greater code share with the common boot path.
>>> zeropage is x86/Linux-specific so we'd need some sort of firmware (like
>>> grub) between a hypervisor and Linux to convert hvm_start_info to
>>> bootparams.
>> qemu?

I think KVM folks didn't want to do this. I can't find the thread but I
believe it was somewhere during the Clear Containers discussion. Paolo?


> But then it won't be using the PVH entry point, and would just use the
> native one?
>
> My understanding was that the PVH shim inside of Linux will prepare a
> zero-page when booted using the PVH entry point, and then jump into
> the native boot path.

Right, but that's not what Juergen's second option is. IIUIC with that
option Linux starts with zeropage already prepared. No shim in the kernel.

-boris
Paolo Bonzini Nov. 29, 2017, 2:44 p.m. UTC | #10
On 29/11/2017 15:25, Boris Ostrovsky wrote:
>>>> zeropage is x86/Linux-specific so we'd need some sort of firmware (like
>>>> grub) between a hypervisor and Linux to convert hvm_start_info to
>>>> bootparams.
>>> qemu?
>
> I think KVM folks didn't want to do this. I can't find the thread but I
> believe it was somewhere during Clear Containers discussion. Paolo?

QEMU is the right place to parse the ELF file and save it in memory.
You would have to teach QEMU to find the Xen note in ELF-format kernels
(just like it looks for the multiboot header), and use a different
option ROM ("pvhboot.c" for example).

However I don't like to bypass the BIOS; for -kernel, KVM starts the
guest with an option ROM (linuxboot-dma.c or multiboot.S in QEMU
sources) that takes care of boot.

In either case, you would have a new option ROM.  It could either be
very simple and similar to multiboot.S, or it could be larger and do the
same task as xen-pvh.S and enlighten_pvh.c (then get the address of
startup_32 or startup_64 from FW_CFG_KERNEL_ENTRY and jump there).  The
ugly part is that the option ROM would have to know more details about
what it is going to boot, including for example whether it's 32-bit or
64-bit, so I don't really think it is a good idea.

I actually like this patch, except that I'd get the e820 memory map from
fw_cfg (see the first part of
https://github.com/bonzini/qboot/blob/master/fw_cfg.c, and extract_e820
in https://github.com/bonzini/qboot/blob/master/main.c) instead of the
second module.
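
For reference, the guest-side read over the traditional (non-DMA) fw_cfg
interface is tiny. A minimal sketch follows; which selector key or fw_cfg
file carries the e820 data is not decided here, so that part is an
assumption, and the helper names are made up:

    /* Legacy (non-DMA) fw_cfg interface: selector port 0x510, data port 0x511. */
    #include <stddef.h>
    #include <stdint.h>

    static inline void fw_cfg_select(uint16_t key)
    {
        asm volatile("outw %0, %1" : : "a"(key), "Nd"((uint16_t)0x510));
    }

    static void fw_cfg_read(void *buf, size_t len)
    {
        uint8_t *p = buf;

        while (len--) {
            asm volatile("inb %1, %0" : "=a"(*p) : "Nd"((uint16_t)0x511));
            p++;
        }
    }

    /* e.g. (key name hypothetical):
     *   fw_cfg_select(FW_CFG_E820_KEY);
     *   fw_cfg_read(&nr, sizeof(nr));
     *   fw_cfg_read(entries, nr * sizeof(*entries));
     */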

Thanks,

Paolo

> 
>> But then it won't be using the PVH entry point, and would just use the
>> native one?
>>
>> My understanding was that the PVH shim inside of Linux will prepare a
>> zero-page when booted using the PVH entry point, and then jump into
>> the native boot path.
> Right, but that's not what Juergen's second option is. IIUIC with that
> option Linux starts with zeropage already prepared. No shim in the kernel.
Juergen Gross Nov. 29, 2017, 2:47 p.m. UTC | #11
On 29/11/17 15:44, Paolo Bonzini wrote:
> On 29/11/2017 15:25, Boris Ostrovsky wrote:
>>>>> zeropage is x86/Linux-specific so we'd need some sort of firmware (like
>>>>> grub) between a hypervisor and Linux to convert hvm_start_info to
>>>>> bootparams.
>>>> qemu?
>>
>> I think KVM folks didn't want to do this. I can't find the thread but I
>> believe it was somewhere during Clear Containers discussion. Paolo?
> 
> QEMU is the right place to parse the ELF file and save it in memory.
> You would have to teach QEMU to find the Xen note in ELF-format kernels
> (just like it looks for the multiboot header), and use a different
> option ROM ("pvhboot.c" for example).
> 
> However I don't like to bypass the BIOS; for -kernel, KVM starts the
> guest with an option ROM (linuxboot-dma.c or multiboot.S in QEMU
> sources) that takes care of boot.
> 
> In either case, you would have a new option ROM.  It could either be
> very simple and similar to multiboot.S, or it could be larger and do the
> same task as xen-pvh.S and enlighten_pvh.c (then get the address of
> startup_32 or startup_64 from FW_CFG_KERNEL_ENTRY and jump there).  The
> ugly part is that the option ROM would have to know more details about
> what it is going to boot, including for example whether it's 32-bit or
> 64-bit, so I don't really think it is a good idea.

As grub2 doesn't have to know, qemu shouldn't have to know either.


Juergen
Paolo Bonzini Nov. 29, 2017, 2:50 p.m. UTC | #12
On 29/11/2017 15:47, Juergen Gross wrote:
> On 29/11/17 15:44, Paolo Bonzini wrote:
>> In either case, you would have a new option ROM.  It could either be
>> very simple and similar to multiboot.S, or it could be larger and do the
>> same task as xen-pvh.S and enlighten_pvh.c (then get the address of
>> startup_32 or startup_64 from FW_CFG_KERNEL_ENTRY and jump there).  The
>> ugly part is that the option ROM would have to know more details about
>> what it is going to boot, including for example whether it's 32-bit or
>> 64-bit, so I don't really think it is a good idea.
> 
> As grub2 doesn't have to know, qemu shouldn't have to know either.

That would be exactly what linuxboot-dma.c does already, but it's slower
than PVH because Linux has to uncompress itself.

The above thought experiment would make QEMU able to boot a PVH kernel
without any changes to Linux.

Paolo
Andrew Cooper Nov. 29, 2017, 2:52 p.m. UTC | #13
On 29/11/17 14:47, Juergen Gross wrote:
> On 29/11/17 15:44, Paolo Bonzini wrote:
>> On 29/11/2017 15:25, Boris Ostrovsky wrote:
>>>>>> zeropage is x86/Linux-specific so we'd need some sort of firmware (like
>>>>>> grub) between a hypervisor and Linux to convert hvm_start_info to
>>>>>> bootparams.
>>>>> qemu?
>>> I think KVM folks didn't want to do this. I can't find the thread but I
>>> believe it was somewhere during Clear Containers discussion. Paolo?
>> QEMU is the right place to parse the ELF file and save it in memory.
>> You would have to teach QEMU to find the Xen note in ELF-format kernels
>> (just like it looks for the multiboot header), and use a different
>> option ROM ("pvhboot.c" for example).
>>
>> However I don't like to bypass the BIOS; for -kernel, KVM starts the
>> guest with an option ROM (linuxboot-dma.c or multiboot.S in QEMU
>> sources) that takes care of boot.
>>
>> In either case, you would have a new option ROM.  It could either be
>> very simple and similar to multiboot.S, or it could be larger and do the
>> same task as xen-pvh.S and enlighten_pvh.c (then get the address of
>> startup_32 or startup_64 from FW_CFG_KERNEL_ENTRY and jump there).  The
>> ugly part is that the option ROM would have to know more details about
>> what it is going to boot, including for example whether it's 32-bit or
>> 64-bit, so I don't really think it is a good idea.
> As grub2 doesn't have to know, qemu shouldn't have to know either.

An underlying requirement for this boot protocol was to remove the need
for a priori knowledge of the eventual mode of the guest, which plagues
Xen PV guests.  (One way or another, we need to parse the kernel that
will end up running to work out how to build the domain for it.)

32bit flat mode is easy to set up, sufficiently large for any reasonable
bootstrapping, and places no restrictions on what the eventual guest
wants to do.

~Andrew
Maran Wilson Nov. 29, 2017, 5:14 p.m. UTC | #14
On 11/29/2017 12:59 AM, Paolo Bonzini wrote:
> On 28/11/2017 20:34, Maran Wilson wrote:
>> For certain applications it is desirable to rapidly boot a KVM virtual
>> machine. In cases where legacy hardware and software support within the
>> guest is not needed, Qemu should be able to boot directly into the
>> uncompressed Linux kernel binary without the need to run firmware.
>>
>> There already exists an ABI to allow this for Xen PVH guests and the ABI is
>> supported by Linux and FreeBSD:
>>
>>     https://xenbits.xen.org/docs/unstable/misc/hvmlite.html
>>
>> This PoC patch enables Qemu to use that same entry point for booting KVM
>> guests.
> Nice!  So QEMU would parse the ELF file just like for multiboot, find
> the ELF note, and then prepare an hvmlite boot info struct instead of
> the multiboot one?

Yes, exactly.

> There would then be a new option ROM, very similar
> to multiboot.S.

That is one option. I guess this gets into a discussion about the QEMU 
side of the upcoming patches that would follow ...

I'm currently just initializing the CPU state in QEMU for testing since 
there is only minimal (non-Linux-specific) setup required by the ABI. 
And (borrowing from the Intel Clear Containers patches) that VM setup is 
only performed when the user selects the "nofw" option with the q35 
model. But yeah, if folks think it important to move all such machine 
state initialization out of QEMU and into an option ROM, I can look into 
coding it up that way for the QEMU patches.

Thanks,
-Maran

> Thanks,
>
> Paolo
Paolo Bonzini Nov. 29, 2017, 5:16 p.m. UTC | #15
On 29/11/2017 18:14, Maran Wilson wrote:
> That is one option. I guess this gets into a discussion about the QEMU
> side of the upcoming patches that would follow ...
> 
> I'm currently just initializing the CPU state in QEMU for testing since
> there is only minimal (non-Linux-specific) setup required by the ABI.
> And (borrowing from the Intel Clear Containers patches) that VM setup is
> only performed when the user selects the "nofw" option with the q35
> model. But yeah, if folks think it important to move all such machine
> state initialization out of QEMU and into an option ROM, I can look into
> coding it up that way for the QEMU patches.

Yes, please do an option ROM.  I'll take care of porting it to qboot.

Thanks,

Paolo
Maran Wilson Nov. 29, 2017, 5:24 p.m. UTC | #16
On 11/29/2017 12:21 AM, Juergen Gross wrote:
> On 28/11/17 20:34, Maran Wilson wrote:
>> For certain applications it is desirable to rapidly boot a KVM virtual
>> machine. In cases where legacy hardware and software support within the
>> guest is not needed, Qemu should be able to boot directly into the
>> uncompressed Linux kernel binary without the need to run firmware.
>>
>> There already exists an ABI to allow this for Xen PVH guests and the ABI is
>> supported by Linux and FreeBSD:
>>
>>     https://xenbits.xen.org/docs/unstable/misc/hvmlite.html
>>
>> This PoC patch enables Qemu to use that same entry point for booting KVM
>> guests.
>>
>> Even though the code is still PoC quality, I'm sending this as an RFC now
>> since there are a number of different ways the specific implementation
>> details can be handled. I chose a shared code path for Xen and KVM guests
>> but could just as easily create a separate code path that is advertised by
>> a different ELF note for KVM. There also seems to be some flexibility in
>> how the e820 table data is passed and how (or if) it should be identified
>> as e820 data. As a starting point, I've chosen the options that seem to
>> result in the smallest patch with minimal to no changes required of the
>> x86/HVM direct boot ABI.
> I like the idea.
>
> I'd rather split up the different hypervisor types early and use a
> common set of service functions instead of special casing xen_guest
> everywhere. This would make it much easier to support the KVM PVH
> boot without the need to configure the kernel with CONFIG_XEN.

Thanks for the feedback. I'll try doing something like that as this 
patch moves from proof of concept to a real proposal.

> Another option would be to use the same boot path as with grub: set
> the boot params in zeropage and start at startup_32.

I think others have already responded about that. The main thing I was 
trying to avoid was adding any Linux-specific initialization (like 
zeropage) to QEMU. Especially since this PVH entry point already exists 
in Linux.

Thanks,
-Maran

>
> Juergen
>
>> ---
>>   arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------
>>   1 file changed, 55 insertions(+), 19 deletions(-)
>>
>> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
>> index 98ab176..d93f711 100644
>> --- a/arch/x86/xen/enlighten_pvh.c
>> +++ b/arch/x86/xen/enlighten_pvh.c
>> @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void)
>>   		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
>>   }
>>   
>> -static void __init init_pvh_bootparams(void)
>> +static void __init init_pvh_bootparams(bool xen_guest)
>>   {
>>   	struct xen_memory_map memmap;
>>   	int rc;
>>   
>>   	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
>>   
>> -	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
>> -	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
>> -	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
>> -	if (rc) {
>> -		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
>> -		BUG();
>> +	if (xen_guest) {
>> +		memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
>> +		set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
>> +		rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
>> +		if (rc) {
>> +			xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
>> +			BUG();
>> +		}
>> +		pvh_bootparams.e820_entries = memmap.nr_entries;
>> +	} else if (pvh_start_info.nr_modules > 1) {
>> +		/* The second module should be the e820 data for KVM guests */
>> +		struct hvm_modlist_entry *modaddr;
>> +		char e820_sig[] = "e820 data";
>> +		struct boot_e820_entry *ep;
>> +		struct e820_table *tp;
>> +		char *cmdline_str;
>> +		int idx;
>> +
>> +		modaddr = __va(pvh_start_info.modlist_paddr +
>> +			       sizeof(struct hvm_modlist_entry));
>> +		cmdline_str = __va(modaddr->cmdline_paddr);
>> +
>> +		if ((modaddr->cmdline_paddr) &&
>> +		    (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) {
>> +			tp = __va(modaddr->paddr);
>> +			ep = (struct boot_e820_entry *)tp->entries;
>> +
>> +			pvh_bootparams.e820_entries = tp->nr_entries;
>> +
>> +			for (idx = 0; idx < tp->nr_entries ; idx++, ep++)
>> +				pvh_bootparams.e820_table[idx] = *ep;
>> +		}
>>   	}
>> -	pvh_bootparams.e820_entries = memmap.nr_entries;
>>   
>>   	if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
>>   		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr =
>> @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void)
>>   		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type =
>>   			E820_TYPE_RESERVED;
>>   		pvh_bootparams.e820_entries++;
>> -	} else
>> +	} else if (xen_guest) {
>>   		xen_raw_printk("Warning: Can fit ISA range into e820\n");
>> +	}
>>   
>>   	pvh_bootparams.hdr.cmd_line_ptr =
>>   		pvh_start_info.cmdline_paddr;
>> @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void)
>>   	 * environment (i.e. hardware_subarch 0).
>>   	 */
>>   	pvh_bootparams.hdr.version = 0x212;
>> -	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
>> +	pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0;
>>   }
>>   
>>   /*
>> @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void)
>>    */
>>   void __init xen_prepare_pvh(void)
>>   {
>> -	u32 msr;
>> +
>> +	u32 msr = xen_cpuid_base();
>>   	u64 pfn;
>> +	bool xen_guest = msr ? true : false;
>>   
>>   	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
>> -		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
>> -				pvh_start_info.magic);
>> +		if (xen_guest)
>> +			xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
>> +					pvh_start_info.magic);
>>   		BUG();
>>   	}
>>   
>> -	xen_pvh = 1;
>> +	if (xen_guest) {
>> +		xen_pvh = 1;
>> +
>> +		msr = cpuid_ebx(msr + 2);
>> +		pfn = __pa(hypercall_page);
>> +		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
>> +
>> +	} else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) {
>> +		BUG();
>> +	}
>>   
>> -	msr = cpuid_ebx(xen_cpuid_base() + 2);
>> -	pfn = __pa(hypercall_page);
>> -	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
>> +	init_pvh_bootparams(xen_guest);
>>   
>> -	init_pvh_bootparams();
>> +	if (xen_guest)
>> +		x86_init.oem.arch_setup = xen_pvh_arch_setup;
>>   
>> -	x86_init.oem.arch_setup = xen_pvh_arch_setup;
>>   }
>>
Maran Wilson Nov. 30, 2017, 6:23 p.m. UTC | #17
On 11/29/2017 6:44 AM, Paolo Bonzini wrote:
> I actually like this patch, except that I'd get the e820 memory map from
> fw_cfg (see the first part of
> https://github.com/bonzini/qboot/blob/master/fw_cfg.c, and extract_e820
> in https://github.com/bonzini/qboot/blob/master/main.c) instead of the
> second module.

Hi Paolo,

I want to make sure I understand exactly what you are suggesting...

Are you saying the Linux PVH entry code (such as init_pvh_bootparams()) 
should use the fw_cfg interface to read the e820 memory map data and put 
it into the zeropage? Basically, keeping the patch very much like it 
already is, just extracting the e820 data via the fw_cfg interface 
instead of from the second module of the start_info struct?

If that is the case, I guess I'm a bit hesitant to throw the QEMU-specific 
fw_cfg interface into the mix on the Linux PVH side when the 
existing PVH ABI already seems to contain an interface for passing 
modules/blobs to the guest. But if you feel there is a compelling reason 
to use the fw_cfg interface here, I'm happy to explore that approach 
further.

Thanks,
-Maran

Patch

diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 98ab176..d93f711 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -31,21 +31,46 @@  static void xen_pvh_arch_setup(void)
 		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
 }
 
-static void __init init_pvh_bootparams(void)
+static void __init init_pvh_bootparams(bool xen_guest)
 {
 	struct xen_memory_map memmap;
 	int rc;
 
 	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
 
-	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
-	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
-	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
-	if (rc) {
-		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
-		BUG();
+	if (xen_guest) {
+		memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
+		set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
+		rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+		if (rc) {
+			xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
+			BUG();
+		}
+		pvh_bootparams.e820_entries = memmap.nr_entries;
+	} else if (pvh_start_info.nr_modules > 1) {
+		/* The second module should be the e820 data for KVM guests */
+		struct hvm_modlist_entry *modaddr;
+		char e820_sig[] = "e820 data";
+		struct boot_e820_entry *ep;
+		struct e820_table *tp;
+		char *cmdline_str;
+		int idx;
+
+		modaddr = __va(pvh_start_info.modlist_paddr +
+			       sizeof(struct hvm_modlist_entry));
+		cmdline_str = __va(modaddr->cmdline_paddr);
+
+		if ((modaddr->cmdline_paddr) &&
+		    (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) {
+			tp = __va(modaddr->paddr);
+			ep = (struct boot_e820_entry *)tp->entries;
+
+			pvh_bootparams.e820_entries = tp->nr_entries;
+
+			for (idx = 0; idx < tp->nr_entries ; idx++, ep++)
+				pvh_bootparams.e820_table[idx] = *ep;
+		}
 	}
-	pvh_bootparams.e820_entries = memmap.nr_entries;
 
 	if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
 		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr =
@@ -55,8 +80,9 @@  static void __init init_pvh_bootparams(void)
 		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type =
 			E820_TYPE_RESERVED;
 		pvh_bootparams.e820_entries++;
-	} else
+	} else if (xen_guest) {
 		xen_raw_printk("Warning: Can fit ISA range into e820\n");
+	}
 
 	pvh_bootparams.hdr.cmd_line_ptr =
 		pvh_start_info.cmdline_paddr;
@@ -76,7 +102,7 @@  static void __init init_pvh_bootparams(void)
 	 * environment (i.e. hardware_subarch 0).
 	 */
 	pvh_bootparams.hdr.version = 0x212;
-	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
+	pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0;
 }
 
 /*
@@ -85,22 +111,32 @@  static void __init init_pvh_bootparams(void)
  */
 void __init xen_prepare_pvh(void)
 {
-	u32 msr;
+
+	u32 msr = xen_cpuid_base();
 	u64 pfn;
+	bool xen_guest = msr ? true : false;
 
 	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
-		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
-				pvh_start_info.magic);
+		if (xen_guest)
+			xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
+					pvh_start_info.magic);
 		BUG();
 	}
 
-	xen_pvh = 1;
+	if (xen_guest) {
+		xen_pvh = 1;
+
+		msr = cpuid_ebx(msr + 2);
+		pfn = __pa(hypercall_page);
+		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	} else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) {
+		BUG();
+	}
 
-	msr = cpuid_ebx(xen_cpuid_base() + 2);
-	pfn = __pa(hypercall_page);
-	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+	init_pvh_bootparams(xen_guest);
 
-	init_pvh_bootparams();
+	if (xen_guest)
+		x86_init.oem.arch_setup = xen_pvh_arch_setup;
 
-	x86_init.oem.arch_setup = xen_pvh_arch_setup;
 }