diff mbox

[RFC,17/17] KVM: PPC: Add an ioctl for userspace to select which platform to emulate

Message ID 20110629104103.GR25406@bloggs.ozlabs.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Paul Mackerras June 29, 2011, 10:41 a.m. UTC
This new ioctl allows userspace to specify what paravirtualization
interface (if any) KVM should implement, what architecture version
the guest virtual processors should conform to, and whether the guest
can be permitted to use a real supervisor mode.

At present the only effect of the ioctl is to indicate whether the
requested emulation is available, but in future it may be used to
select between different emulation techniques (book3s_pr vs. book3s_hv)
or set the CPU compatibility mode for the guest.

If book3s_pr KVM is enabled in the kernel config, then this new
ioctl accepts platform values of KVM_PPC_PV_NONE and KVM_PPC_PV_KVM,
but not KVM_PPC_PV_SPAPR.  If book3s_hv KVM is enabled, then this
ioctl requires that the platform is KVM_PPC_PV_SPAPR and the
guest_arch field contains one of 201 or 206 (for architecture versions
2.01 and 2.06) -- when running on a PPC970, it must contain 201, and
when running on a POWER7, it must contain 206.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 Documentation/virtual/kvm/api.txt   |   35 +++++++++++++++++++++++++++++++++++
 arch/powerpc/include/asm/kvm.h      |   15 +++++++++++++++
 arch/powerpc/include/asm/kvm_host.h |    1 +
 arch/powerpc/kvm/powerpc.c          |   28 ++++++++++++++++++++++++++++
 include/linux/kvm.h                 |    1 +
 5 files changed, 80 insertions(+), 0 deletions(-)

Comments

Josh Boyer June 29, 2011, 11:53 a.m. UTC | #1
On Wed, Jun 29, 2011 at 08:41:03PM +1000, Paul Mackerras wrote:
> Documentation/virtual/kvm/api.txt   |   35 +++++++++++++++++++++++++++++++++++
> arch/powerpc/include/asm/kvm.h      |   15 +++++++++++++++
> arch/powerpc/include/asm/kvm_host.h |    1 +
> arch/powerpc/kvm/powerpc.c          |   28 ++++++++++++++++++++++++++++
> include/linux/kvm.h                 |    1 +
> 5 files changed, 80 insertions(+), 0 deletions(-)
>
>diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>index b0e4b9c..3ab012c 100644
>--- a/Documentation/virtual/kvm/api.txt
>+++ b/Documentation/virtual/kvm/api.txt
>@@ -1430,6 +1430,41 @@ is supported; 2 if the processor requires all virtual machines to have
> an RMA, or 1 if the processor can use an RMA but doesn't require it,
> because it supports the Virtual RMA (VRMA) facility.
>
>+4.64 KVM_PPC_SET_PLATFORM
>+
>+Capability: none
>+Architectures: powerpc
>+Type: vm ioctl
>+Parameters: struct kvm_ppc_set_platform (in)
>+Returns: 0, or -1 on error
>+
>+This is used by userspace to tell KVM what sort of platform it should
>+emulate.  The return value of the ioctl tells userspace whether the
>+emulation it is requesting is supported by KVM.
>+
>+struct kvm_ppc_set_platform {
>+	__u16 platform;		/* defines the OS/hypervisor ABI */
>+	__u16 guest_arch;	/* e.g. decimal 206 for v2.06 */
>+	__u32 flags;
>+};
>+
>+/* Values for platform */
>+#define KVM_PPC_PV_NONE		0	/* bare-metal, non-paravirtualized */
>+#define KVM_PPC_PV_KVM		1	/* as defined in kvm_para.h */
>+#define KVM_PPC_PV_SPAPR	2	/* IBM Server PAPR (a la PowerVM) */
>+
>+/* Values for flags */
>+#define KVM_PPC_CROSS_ARCH	1	/* guest architecture != host */
>+
>+The KVM_PPC_CROSS_ARCH bit being 1 indicates that the guest is of a
>+sufficiently different architecture to the host that the guest cannot
>+be permitted to use supervisor mode.  For example, if the host is a
>+64-bit machine and the guest is a 32-bit machine, then this bit should
>+be set.

This makes me wonder if a similar thing might eventually be usable for
running an i686 or x32 guest on an x86_64 KVM host.  I have no idea if
that is even theoretically possible, but if it is it might be better to
rename the ioctl to be architecture agnostic.

josh
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf June 29, 2011, 11:56 a.m. UTC | #2
On 29.06.2011, at 13:53, Josh Boyer wrote:

> On Wed, Jun 29, 2011 at 08:41:03PM +1000, Paul Mackerras wrote:
>> Documentation/virtual/kvm/api.txt   |   35 +++++++++++++++++++++++++++++++++++
>> arch/powerpc/include/asm/kvm.h      |   15 +++++++++++++++
>> arch/powerpc/include/asm/kvm_host.h |    1 +
>> arch/powerpc/kvm/powerpc.c          |   28 ++++++++++++++++++++++++++++
>> include/linux/kvm.h                 |    1 +
>> 5 files changed, 80 insertions(+), 0 deletions(-)
>> 
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index b0e4b9c..3ab012c 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -1430,6 +1430,41 @@ is supported; 2 if the processor requires all virtual machines to have
>> an RMA, or 1 if the processor can use an RMA but doesn't require it,
>> because it supports the Virtual RMA (VRMA) facility.
>> 
>> +4.64 KVM_PPC_SET_PLATFORM
>> +
>> +Capability: none
>> +Architectures: powerpc
>> +Type: vm ioctl
>> +Parameters: struct kvm_ppc_set_platform (in)
>> +Returns: 0, or -1 on error
>> +
>> +This is used by userspace to tell KVM what sort of platform it should
>> +emulate.  The return value of the ioctl tells userspace whether the
>> +emulation it is requesting is supported by KVM.
>> +
>> +struct kvm_ppc_set_platform {
>> +	__u16 platform;		/* defines the OS/hypervisor ABI */
>> +	__u16 guest_arch;	/* e.g. decimal 206 for v2.06 */
>> +	__u32 flags;
>> +};
>> +
>> +/* Values for platform */
>> +#define KVM_PPC_PV_NONE		0	/* bare-metal, non-paravirtualized */
>> +#define KVM_PPC_PV_KVM		1	/* as defined in kvm_para.h */
>> +#define KVM_PPC_PV_SPAPR	2	/* IBM Server PAPR (a la PowerVM) */
>> +
>> +/* Values for flags */
>> +#define KVM_PPC_CROSS_ARCH	1	/* guest architecture != host */
>> +
>> +The KVM_PPC_CROSS_ARCH bit being 1 indicates that the guest is of a
>> +sufficiently different architecture to the host that the guest cannot
>> +be permitted to use supervisor mode.  For example, if the host is a
>> +64-bit machine and the guest is a 32-bit machine, then this bit should
>> +be set.
> 
> This makes me wonder if a similar thing might eventually be usable for
> running an i686 or x32 guest on an x86_64 KVM host.  I have no idea if
> that is even theoretically possible, but if it is it might be better to
> rename the ioctl to be architecture agnostic.

On x86 this is not required unless we want to "virtualize" pre-CPUID CPUs. Everything as of Pentium has a full bitmap of feature capabilities that KVM gets from user space, including information such as "Can we do 64-bit mode?".


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josh Boyer June 29, 2011, 11:58 a.m. UTC | #3
On Wed, Jun 29, 2011 at 01:56:16PM +0200, Alexander Graf wrote:
>
>On 29.06.2011, at 13:53, Josh Boyer wrote:
>
>> On Wed, Jun 29, 2011 at 08:41:03PM +1000, Paul Mackerras wrote:
>>> Documentation/virtual/kvm/api.txt   |   35 +++++++++++++++++++++++++++++++++++
>>> arch/powerpc/include/asm/kvm.h      |   15 +++++++++++++++
>>> arch/powerpc/include/asm/kvm_host.h |    1 +
>>> arch/powerpc/kvm/powerpc.c          |   28 ++++++++++++++++++++++++++++
>>> include/linux/kvm.h                 |    1 +
>>> 5 files changed, 80 insertions(+), 0 deletions(-)
>>> 
>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>>> index b0e4b9c..3ab012c 100644
>>> --- a/Documentation/virtual/kvm/api.txt
>>> +++ b/Documentation/virtual/kvm/api.txt
>>> @@ -1430,6 +1430,41 @@ is supported; 2 if the processor requires all virtual machines to have
>>> an RMA, or 1 if the processor can use an RMA but doesn't require it,
>>> because it supports the Virtual RMA (VRMA) facility.
>>> 
>>> +4.64 KVM_PPC_SET_PLATFORM
>>> +
>>> +Capability: none
>>> +Architectures: powerpc
>>> +Type: vm ioctl
>>> +Parameters: struct kvm_ppc_set_platform (in)
>>> +Returns: 0, or -1 on error
>>> +
>>> +This is used by userspace to tell KVM what sort of platform it should
>>> +emulate.  The return value of the ioctl tells userspace whether the
>>> +emulation it is requesting is supported by KVM.
>>> +
>>> +struct kvm_ppc_set_platform {
>>> +	__u16 platform;		/* defines the OS/hypervisor ABI */
>>> +	__u16 guest_arch;	/* e.g. decimal 206 for v2.06 */
>>> +	__u32 flags;
>>> +};
>>> +
>>> +/* Values for platform */
>>> +#define KVM_PPC_PV_NONE		0	/* bare-metal, non-paravirtualized */
>>> +#define KVM_PPC_PV_KVM		1	/* as defined in kvm_para.h */
>>> +#define KVM_PPC_PV_SPAPR	2	/* IBM Server PAPR (a la PowerVM) */
>>> +
>>> +/* Values for flags */
>>> +#define KVM_PPC_CROSS_ARCH	1	/* guest architecture != host */
>>> +
>>> +The KVM_PPC_CROSS_ARCH bit being 1 indicates that the guest is of a
>>> +sufficiently different architecture to the host that the guest cannot
>>> +be permitted to use supervisor mode.  For example, if the host is a
>>> +64-bit machine and the guest is a 32-bit machine, then this bit should
>>> +be set.
>> 
>> This makes me wonder if a similar thing might eventually be usable for
>> running an i686 or x32 guest on an x86_64 KVM host.  I have no idea if
>> that is even theoretically possible, but if it is it might be better to
>> rename the ioctl to be architecture agnostic.
>
>On x86 this is not required unless we want to "virtualize" pre-CPUID CPUs. Everything as of Pentium has a full bitmap of feature capabilities that KVM gets from user space, including information such as "Can we do 64-bit mode?".

Ah.  Thank you for the explanation.

josh
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf June 30, 2011, 3:04 p.m. UTC | #4
On 06/29/2011 12:41 PM, Paul Mackerras wrote:
> This new ioctl allows userspace to specify what paravirtualization
> interface (if any) KVM should implement, what architecture version
> the guest virtual processors should conform to, and whether the guest
> can be permitted to use a real supervisor mode.
>
> At present the only effect of the ioctl is to indicate whether the
> requested emulation is available, but in future it may be used to
> select between different emulation techniques (book3s_pr vs. book3s_hv)
> or set the CPU compatibility mode for the guest.
>
> If book3s_pr KVM is enabled in the kernel config, then this new
> ioctl accepts platform values of KVM_PPC_PV_NONE and KVM_PPC_PV_KVM,
> but not KVM_PPC_PV_SPAPR.  If book3s_hv KVM is enabled, then this
> ioctl requires that the platform is KVM_PPC_PV_SPAPR and the
> guest_arch field contains one of 201 or 206 (for architecture versions
> 2.01 and 2.06) -- when running on a PPC970, it must contain 201, and
> when running on a POWER7, it must contain 206.
>
> Signed-off-by: Paul Mackerras<paulus@samba.org>
> ---
>   Documentation/virtual/kvm/api.txt   |   35 +++++++++++++++++++++++++++++++++++
>   arch/powerpc/include/asm/kvm.h      |   15 +++++++++++++++
>   arch/powerpc/include/asm/kvm_host.h |    1 +
>   arch/powerpc/kvm/powerpc.c          |   28 ++++++++++++++++++++++++++++
>   include/linux/kvm.h                 |    1 +
>   5 files changed, 80 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index b0e4b9c..3ab012c 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1430,6 +1430,41 @@ is supported; 2 if the processor requires all virtual machines to have
>   an RMA, or 1 if the processor can use an RMA but doesn't require it,
>   because it supports the Virtual RMA (VRMA) facility.
>
> +4.64 KVM_PPC_SET_PLATFORM
> +
> +Capability: none
> +Architectures: powerpc
> +Type: vm ioctl
> +Parameters: struct kvm_ppc_set_platform (in)
> +Returns: 0, or -1 on error
> +
> +This is used by userspace to tell KVM what sort of platform it should
> +emulate.  The return value of the ioctl tells userspace whether the
> +emulation it is requesting is supported by KVM.
> +
> +struct kvm_ppc_set_platform {
> +	__u16 platform;		/* defines the OS/hypervisor ABI */
> +	__u16 guest_arch;	/* e.g. decimal 206 for v2.06 */
> +	__u32 flags;

Please add some padding so we can extend it later if necessary.

> +};
> +
> +/* Values for platform */
> +#define KVM_PPC_PV_NONE		0	/* bare-metal, non-paravirtualized */
> +#define KVM_PPC_PV_KVM		1	/* as defined in kvm_para.h */
> +#define KVM_PPC_PV_SPAPR	2	/* IBM Server PAPR (a la PowerVM) */

We also support BookE which would be useful to also include in the list.
Furthermore, KVM is more of a feature flag than a platform. We can 
easily support KVM extensions on an SPAPR platform, no?

This whole interface also could deprecate the PVR setting one, so we can 
simply include PVR as well and not require kernel space to jump through 
hoops to figure out its capabilities.

And we need to identify 32-bit BookS processors, so we can go into 
32-bit mode when necessary. That should also be a different guest_arch, 
right?

> +
> +/* Values for flags */
> +#define KVM_PPC_CROSS_ARCH	1	/* guest architecture != host */

User space shouldn't have to worry about this one. It's up to the kernel 
to decide that it's cross.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 30, 2011, 3:16 p.m. UTC | #5
On 06/30/2011 06:04 PM, Alexander Graf wrote:
>> +4.64 KVM_PPC_SET_PLATFORM
>> +
>> +Capability: none
>> +Architectures: powerpc
>> +Type: vm ioctl
>> +Parameters: struct kvm_ppc_set_platform (in)
>> +Returns: 0, or -1 on error
>> +
>> +This is used by userspace to tell KVM what sort of platform it should
>> +emulate.  The return value of the ioctl tells userspace whether the
>> +emulation it is requesting is supported by KVM.
>> +
>> +struct kvm_ppc_set_platform {
>> +    __u16 platform;        /* defines the OS/hypervisor ABI */
>> +    __u16 guest_arch;    /* e.g. decimal 206 for v2.06 */
>> +    __u32 flags;
>
>
> Please add some padding so we can extend it later if necessary.

Regarding that.  There's another option - the ioctl code embeds the 
structure size.  So if we extend the ioctl parsing to pad up (or 
truncate down) from the user's size to our size, and similarly in the 
other direction, we can get away from this ugliness.

Some years ago I posted a generic helper that did this (and also 
kmalloc'ed and kfree'd the data itself), but it wasn't received 
favourably.  Maybe I should try again (and we can possibly use it in kvm 
even if it is rejected for general use, though that's against our 
principles of pushing all generic infrastructure to the wider kernel).
Avi Kivity June 30, 2011, 4 p.m. UTC | #6
On 06/30/2011 06:22 PM, Alexander Graf wrote:
>> Regarding that.  There's another option - the ioctl code embeds the 
>> structure size.  So if we extend the ioctl parsing to pad up (or 
>> truncate down) from the user's size to our size, and similarly in the 
>> other direction, we can get away from this ugliness.
>>
>> Some years ago I posted a generic helper that did this (and also 
>> kmalloc'ed and kfree'd the data itself), but it wasn't received 
>> favourably.  Maybe I should try again (and we can possibly use it in 
>> kvm even if it is rejected for general use, though that's against our 
>> principles of pushing all generic infrastructure to the wider kernel).
>
>
> That does sound interesting, but requires a lot more thought to be put 
> into the actual code, as we basically need to read out the feature 
> bitmap, then provide a minimum size for the chosen features and then 
> decide if they fit in.


Why? just put the things you want in the structure.

old userspace -> new kernel: we auto-zero the parts userspace left out, 
and zero means old behaviour, so everthing works
new userspace -> old kernel: truncate.  Userspace shouldn't have used 
any new features (KVM_CAP), and we can -EINVAL if the truncated section 
contains a nonzero bit.
Alexander Graf June 30, 2011, 4:33 p.m. UTC | #7
On 30.06.2011, at 18:00, Avi Kivity <avi@redhat.com> wrote:

> On 06/30/2011 06:22 PM, Alexander Graf wrote:
>>> Regarding that.  There's another option - the ioctl code embeds the structure size.  So if we extend the ioctl parsing to pad up (or truncate down) from the user's size to our size, and similarly in the other direction, we can get away from this ugliness.
>>> 
>>> Some years ago I posted a generic helper that did this (and also kmalloc'ed and kfree'd the data itself), but it wasn't received favourably.  Maybe I should try again (and we can possibly use it in kvm even if it is rejected for general use, though that's against our principles of pushing all generic infrastructure to the wider kernel).
>> 
>> 
>> That does sound interesting, but requires a lot more thought to be put into the actual code, as we basically need to read out the feature bitmap, then provide a minimum size for the chosen features and then decide if they fit in.
> 
> 
> Why? just put the things you want in the structure.
> 
> old userspace -> new kernel: we auto-zero the parts userspace left out, and zero means old behaviour, so everthing works
> new userspace -> old kernel: truncate.  Userspace shouldn't have used any new features (KVM_CAP), and we can -EINVAL if the truncated section contains a nonzero bit.

Yup, which requires knowledge in the code on what actually fits :). Logic we don't have today.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt June 30, 2011, 11:13 p.m. UTC | #8
> Regarding that.  There's another option - the ioctl code embeds the 
> structure size.  So if we extend the ioctl parsing to pad up (or 
> truncate down) from the user's size to our size, and similarly in the 
> other direction, we can get away from this ugliness.

I don't like relying on that much ... I prefer having an explicit
version in the structure head (or use a flag).

> Some years ago I posted a generic helper that did this (and also 
> kmalloc'ed and kfree'd the data itself), but it wasn't received 
> favourably.  Maybe I should try again (and we can possibly use it in kvm 
> even if it is rejected for general use, though that's against our 
> principles of pushing all generic infrastructure to the wider kernel).

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras July 1, 2011, 10:09 a.m. UTC | #9
On Thu, Jun 30, 2011 at 05:04:23PM +0200, Alexander Graf wrote:
> On 06/29/2011 12:41 PM, Paul Mackerras wrote:
> >+struct kvm_ppc_set_platform {
> >+	__u16 platform;		/* defines the OS/hypervisor ABI */
> >+	__u16 guest_arch;	/* e.g. decimal 206 for v2.06 */
> >+	__u32 flags;
> 
> Please add some padding so we can extend it later if necessary.
> 
> >+};
> >+
> >+/* Values for platform */
> >+#define KVM_PPC_PV_NONE		0	/* bare-metal, non-paravirtualized */
> >+#define KVM_PPC_PV_KVM		1	/* as defined in kvm_para.h */
> >+#define KVM_PPC_PV_SPAPR	2	/* IBM Server PAPR (a la PowerVM) */
> 
> We also support BookE which would be useful to also include in the list.
> Furthermore, KVM is more of a feature flag than a platform. We can
> easily support KVM extensions on an SPAPR platform, no?

Yes, I guess so.  The hypercall sequence will have to be different,
since ordinary system call interrupts go straight to the guest.  But I
guess you've allowed for that with the hypercall sequence property in
the device tree.

> This whole interface also could deprecate the PVR setting one, so we
> can simply include PVR as well and not require kernel space to jump
> through hoops to figure out its capabilities.

I debated about whether to include a PVR value in this structure.

The thing is that POWER7 has the "Processor Compatibility Register"
(PCR), which has a bit which makes the processor behave in user mode
as if it were a POWER6.  So, we could run a book3s_hv guest in POWER6
mode by setting this bit (which we might want to do to run older
distros).  However, this bit doesn't affect the PVR value that the
guest sees.  That's why I went for an architecture level rather than a
specific PVR value.

We could go with a PVR value and use the "logical" PVR values defined
in PAPR to represent architecture levels, e.g. 0x0f000002 for
architecture v2.05 (POWER6).

> And we need to identify 32-bit BookS processors, so we can go into
> 32-bit mode when necessary. That should also be a different
> guest_arch, right?

Right.  If we go with a PVR value then we just use the PVR value for a
suitable 32-bit processor.

> >+
> >+/* Values for flags */
> >+#define KVM_PPC_CROSS_ARCH	1	/* guest architecture != host */
> 
> User space shouldn't have to worry about this one. It's up to the
> kernel to decide that it's cross.

I put that in because we might want to force the use of book3s_pr, for
example if we know we're going to want to do emulated MMIO or
something else that isn't implemented in book3s_hv just yet.

Ultimately, yes, the kernel should be able to decide whether it's
cross or not.  However, I don't think we should make it completely
opaque to userspace as to whether the kernel is using _pr or _hv.
If nothing else, userspace should be able to find out and tell the
user so that performance expectations can be set correctly.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf July 1, 2011, 10:23 a.m. UTC | #10
On 01.07.2011, at 12:09, Paul Mackerras wrote:

> On Thu, Jun 30, 2011 at 05:04:23PM +0200, Alexander Graf wrote:
>> On 06/29/2011 12:41 PM, Paul Mackerras wrote:
>>> +struct kvm_ppc_set_platform {
>>> +	__u16 platform;		/* defines the OS/hypervisor ABI */
>>> +	__u16 guest_arch;	/* e.g. decimal 206 for v2.06 */
>>> +	__u32 flags;
>> 
>> Please add some padding so we can extend it later if necessary.
>> 
>>> +};
>>> +
>>> +/* Values for platform */
>>> +#define KVM_PPC_PV_NONE		0	/* bare-metal, non-paravirtualized */
>>> +#define KVM_PPC_PV_KVM		1	/* as defined in kvm_para.h */
>>> +#define KVM_PPC_PV_SPAPR	2	/* IBM Server PAPR (a la PowerVM) */
>> 
>> We also support BookE which would be useful to also include in the list.
>> Furthermore, KVM is more of a feature flag than a platform. We can
>> easily support KVM extensions on an SPAPR platform, no?
> 
> Yes, I guess so.  The hypercall sequence will have to be different,
> since ordinary system call interrupts go straight to the guest.  But I
> guess you've allowed for that with the hypercall sequence property in
> the device tree.
> 
>> This whole interface also could deprecate the PVR setting one, so we
>> can simply include PVR as well and not require kernel space to jump
>> through hoops to figure out its capabilities.
> 
> I debated about whether to include a PVR value in this structure.
> 
> The thing is that POWER7 has the "Processor Compatibility Register"
> (PCR), which has a bit which makes the processor behave in user mode
> as if it were a POWER6.  So, we could run a book3s_hv guest in POWER6
> mode by setting this bit (which we might want to do to run older
> distros).  However, this bit doesn't affect the PVR value that the
> guest sees.  That's why I went for an architecture level rather than a
> specific PVR value.
> 
> We could go with a PVR value and use the "logical" PVR values defined
> in PAPR to represent architecture levels, e.g. 0x0f000002 for
> architecture v2.05 (POWER6).

IIUC the PVR values are somewhat standardized to contain major and minor revision numbers. Can't we just mask out the minor ones and match for known good systems?

> 
>> And we need to identify 32-bit BookS processors, so we can go into
>> 32-bit mode when necessary. That should also be a different
>> guest_arch, right?
> 
> Right.  If we go with a PVR value then we just use the PVR value for a
> suitable 32-bit processor.

Well, we need to have some way of mapping PVR to arch then. KVM easily supports -cpu G3 and G4. We might also want to have some information on feature flags, such as Altivec or SPE mode available. Or paired singles :). I'm not sure I want to have all that mapping information inside the kernel.

So what we could do is we just provide as much information as we can from user space, including PVR, architecture (2.01 for example), features (32/64-bit, booke/books, fpu, altivec, spe, ...).

> 
>>> +
>>> +/* Values for flags */
>>> +#define KVM_PPC_CROSS_ARCH	1	/* guest architecture != host */
>> 
>> User space shouldn't have to worry about this one. It's up to the
>> kernel to decide that it's cross.
> 
> I put that in because we might want to force the use of book3s_pr, for
> example if we know we're going to want to do emulated MMIO or
> something else that isn't implemented in book3s_hv just yet.

Ah, I see. Well, we could just add a flag to the feature list saying MMIO. If that's impossible to satisfy (HV only), fail the call. Otherwise switch to _pr mode. Later when _hv might be able to support MMIO, we can use it without changing user space.

> Ultimately, yes, the kernel should be able to decide whether it's
> cross or not.  However, I don't think we should make it completely
> opaque to userspace as to whether the kernel is using _pr or _hv.
> If nothing else, userspace should be able to find out and tell the
> user so that performance expectations can be set correctly.

Hrm. Sure, but the decision should be done in kernel land based on all information required to actually make it. And the kernel has more information regarding the system it's running on, so that's the place to actually do the decision. Bubbling it up to user space again is certainly fine by me :).


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 3, 2011, 8:15 a.m. UTC | #11
On 06/30/2011 07:33 PM, Alexander Graf wrote:
> On 30.06.2011, at 18:00, Avi Kivity<avi@redhat.com>  wrote:
>
> >  On 06/30/2011 06:22 PM, Alexander Graf wrote:
> >>>  Regarding that.  There's another option - the ioctl code embeds the structure size.  So if we extend the ioctl parsing to pad up (or truncate down) from the user's size to our size, and similarly in the other direction, we can get away from this ugliness.
> >>>
> >>>  Some years ago I posted a generic helper that did this (and also kmalloc'ed and kfree'd the data itself), but it wasn't received favourably.  Maybe I should try again (and we can possibly use it in kvm even if it is rejected for general use, though that's against our principles of pushing all generic infrastructure to the wider kernel).
> >>
> >>
> >>  That does sound interesting, but requires a lot more thought to be put into the actual code, as we basically need to read out the feature bitmap, then provide a minimum size for the chosen features and then decide if they fit in.
> >
> >
> >  Why? just put the things you want in the structure.
> >
> >  old userspace ->  new kernel: we auto-zero the parts userspace left out, and zero means old behaviour, so everthing works
> >  new userspace ->  old kernel: truncate.  Userspace shouldn't have used any new features (KVM_CAP), and we can -EINVAL if the truncated section contains a nonzero bit.
>
> Yup, which requires knowledge in the code on what actually fits :). Logic we don't have today.

I don't follow.  What knowledge is required?  Please give an example.
Alexander Graf July 3, 2011, 8:34 a.m. UTC | #12
On 03.07.2011, at 10:15, Avi Kivity wrote:

> On 06/30/2011 07:33 PM, Alexander Graf wrote:
>> On 30.06.2011, at 18:00, Avi Kivity<avi@redhat.com>  wrote:
>> 
>> >  On 06/30/2011 06:22 PM, Alexander Graf wrote:
>> >>>  Regarding that.  There's another option - the ioctl code embeds the structure size.  So if we extend the ioctl parsing to pad up (or truncate down) from the user's size to our size, and similarly in the other direction, we can get away from this ugliness.
>> >>>
>> >>>  Some years ago I posted a generic helper that did this (and also kmalloc'ed and kfree'd the data itself), but it wasn't received favourably.  Maybe I should try again (and we can possibly use it in kvm even if it is rejected for general use, though that's against our principles of pushing all generic infrastructure to the wider kernel).
>> >>
>> >>
>> >>  That does sound interesting, but requires a lot more thought to be put into the actual code, as we basically need to read out the feature bitmap, then provide a minimum size for the chosen features and then decide if they fit in.
>> >
>> >
>> >  Why? just put the things you want in the structure.
>> >
>> >  old userspace ->  new kernel: we auto-zero the parts userspace left out, and zero means old behaviour, so everthing works
>> >  new userspace ->  old kernel: truncate.  Userspace shouldn't have used any new features (KVM_CAP), and we can -EINVAL if the truncated section contains a nonzero bit.
>> 
>> Yup, which requires knowledge in the code on what actually fits :). Logic we don't have today.
> 
> I don't follow.  What knowledge is required?  Please give an example.

Sure. Let's take an easy example Currently we have for get_pvinfo:

        case KVM_PPC_GET_PVINFO: {
                struct kvm_ppc_pvinfo pvinfo;
                memset(&pvinfo, 0, sizeof(pvinfo));
                r = kvm_vm_ioctl_get_pvinfo(&pvinfo);
                if (copy_to_user(argp, &pvinfo, sizeof(pvinfo))) {
                        r = -EFAULT;
                        goto out;
                }

                break;
        }

        /* for KVM_PPC_GET_PVINFO */
        struct kvm_ppc_pvinfo {
                /* out */
                __u32 flags;
                __u32 hcall[4];
                __u8  pad[108];
        };

The padding would not be there with your idea. An updated version could look like this:

        /* for KVM_PPC_GET_PVINFO */
        struct kvm_ppc_pvinfo {
                /* out */
                __u32 flags;
                __u32 hcall[4];
                __u64 features;  /* only there with PVINFO_FLAGS_FEATURES */
        };

Now, your idea was to not use copy_from/to_user directly, but instead some wrapper that could pad with zeros on read or truncate on write. So instead we would essentially get:

        int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo, int *required_size)
        {
                [...]
		if (pvinfo_flags & PVINFO_FLAGS_FEATURES) {
                        *required_size = 16;
                } else {
                        *required_size = 8;
                }
                [...]
        }

        case KVM_PPC_GET_PVINFO: {
                struct kvm_ppc_pvinfo pvinfo;
                int required_size = 0;
                memset(&pvinfo, 0, sizeof(pvinfo));
                r = kvm_vm_ioctl_get_pvinfo(&pvinfo, &required_size);
                if (copy_to_user(argp, &pvinfo, required_size) {
                        r = -EFAULT;
                        goto out;
                }

                break;
        }

Otherwise we might write over data the user expected. And that logic that tells to copy_to_user how much data it actually takes to put all the information in is not there today and would have to be added. You can even verify that required_size with the ioctl passed size to make 100% sure user space is sane, but I'd claim that a feature bitmap is plenty of information to ensure that we're not doing something stupid.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity July 3, 2011, 9:12 a.m. UTC | #13
On 07/03/2011 12:09 PM, Alexander Graf wrote:
> >
> >  Right.  The idea is that if KVM_FLAG_BLAH implies a field kvm_struct::blah, then either both are present in the headers, or none  of them.
>
> Yup, makes sense. I like the idea :). Gets rid of all the useless paddings and reserved fields. We could even truncate the structs that already have paddings in them if we only copy min(sizeof(real_struct), ioctl_passed_size); (which we should anyways).
>

No, we can't change anything that is already out.  If will change the 
ioctl numbers, so building against new headers but running against an 
old kernel will fail.

> How long until we get a patch set? :)

Well, I'd really like to get the qemu memory API out first.
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b0e4b9c..3ab012c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1430,6 +1430,41 @@  is supported; 2 if the processor requires all virtual machines to have
 an RMA, or 1 if the processor can use an RMA but doesn't require it,
 because it supports the Virtual RMA (VRMA) facility.
 
+4.64 KVM_PPC_SET_PLATFORM
+
+Capability: none
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_ppc_set_platform (in)
+Returns: 0, or -1 on error
+
+This is used by userspace to tell KVM what sort of platform it should
+emulate.  The return value of the ioctl tells userspace whether the
+emulation it is requesting is supported by KVM.
+
+struct kvm_ppc_set_platform {
+	__u16 platform;		/* defines the OS/hypervisor ABI */
+	__u16 guest_arch;	/* e.g. decimal 206 for v2.06 */
+	__u32 flags;
+};
+
+/* Values for platform */
+#define KVM_PPC_PV_NONE		0	/* bare-metal, non-paravirtualized */
+#define KVM_PPC_PV_KVM		1	/* as defined in kvm_para.h */
+#define KVM_PPC_PV_SPAPR	2	/* IBM Server PAPR (a la PowerVM) */
+
+/* Values for flags */
+#define KVM_PPC_CROSS_ARCH	1	/* guest architecture != host */
+
+The KVM_PPC_CROSS_ARCH bit being 1 indicates that the guest is of a
+sufficiently different architecture to the host that the guest cannot
+be permitted to use supervisor mode.  For example, if the host is a
+64-bit machine and the guest is a 32-bit machine, then this bit should
+be set.
+
+The return value is 0 if KVM supports the requested emulation, or -1
+with errno == EINVAL if not.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index a4f6c85..0dd5cfb 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -287,4 +287,19 @@  struct kvm_allocate_rma {
 	__u64 rma_size;
 };
 
+/* for KVM_PPC_SET_PLATFORM */
+struct kvm_ppc_set_platform {
+	__u16 platform;		/* defines the OS/hypervisor ABI */
+	__u16 guest_arch;	/* e.g. decimal 206 for v2.06 */
+	__u32 flags;
+};
+
+/* Values for platform */
+#define KVM_PPC_PV_NONE		0	/* bare-metal, non-paravirtualized */
+#define KVM_PPC_PV_KVM		1	/* as defined in kvm_para.h */
+#define KVM_PPC_PV_SPAPR	2	/* IBM Server PAPR (a la PowerVM) */
+
+/* Values for flags */
+#define KVM_PPC_CROSS_ARCH	1	/* guest architecture != host */
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index cc22b28..00e7f1b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -167,6 +167,7 @@  struct kvmppc_rma_info {
 };
 
 struct kvm_arch {
+	struct kvm_ppc_set_platform platform;
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
 	unsigned long ram_npages;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a107c9b..83265cd 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -690,6 +690,34 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
+	case KVM_PPC_SET_PLATFORM: {
+		struct kvm_ppc_set_platform plat;
+		struct kvm *kvm = filp->private_data;
+
+		r = -EFAULT;
+		if (copy_from_user(&plat, argp, sizeof(plat)))
+			goto out;
+		r = -EINVAL;
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+		if (plat.platform != KVM_PPC_PV_SPAPR || plat.flags)
+			goto out;
+		/* requested guest arch must match real CPU */
+		if (!((plat.guest_arch == 201 &&
+		       cpu_has_feature(CPU_FTR_ARCH_201)) ||
+		      (plat.guest_arch == 206 &&
+		       cpu_has_feature(CPU_FTR_ARCH_206))))
+			goto out;
+#else
+		if (plat.flags & ~KVM_PPC_CROSS_ARCH)
+			goto out;
+		if (plat.platform != KVM_PPC_PV_NONE &&
+		    plat.platform != KVM_PPC_PV_KVM)
+			goto out;
+#endif
+		kvm->arch.platform = plat;
+		r = 0;
+		break;
+	}
 
 	default:
 		r = -ENOTTY;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2c366b5..e6258e7 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -758,6 +758,7 @@  struct kvm_clock_data {
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
+#define KVM_PPC_SET_PLATFORM	  _IOR(KVMIO,  0xaa, struct kvm_ppc_set_platform)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)