[1/5] kvm: x86: Add payload to kvm_queued_exception and kvm_vcpu_events
diff mbox series

Message ID 20181008182945.79957-1-jmattson@google.com
State New
Headers show
Series
  • [1/5] kvm: x86: Add payload to kvm_queued_exception and kvm_vcpu_events
Related show

Commit Message

Jim Mattson Oct. 8, 2018, 6:29 p.m. UTC
Under nested virtualization, the L1 hypervisor may intercept an
exception raised during the execution of L2 before the exception
is delivered. When the intercepted exception is #PF, the VM-exit
to the L1 hypervisor precedes the modification of CR2. When the
intercepted exception is #DB, the VM-exit to the L1 hypervisor
precedes the modifications of DR6 and DR7 under VMX, but the
VM-exit to the L1 hypervisor follows the modifications of DR6 and
DR7 under SVM.

At present, CR2 is modified too early under both VMX and SVM. DR6 is
modified too early under VMX. DR7 is modified at the appropriate time.
Unfortunately, it is possible to exit to userspace with one of these
exceptions pending, and userspace may rely on the premature
side-effects. It is also possible for userspace to inject one of these
exceptions, in which case, userspace will presumably have already
processed the side-effects.

To address this problem, a new per-VM capability
(KVM_CAP_EXCEPTION_PAYLOAD) will be introduced. When this capability
is enabled by userspace, the faulting linear address will be included
with the information about a pending #PF in L2, and the "new DR6 bits"
will be included with the information about a pending #DB in L2. This
ancillary exception information is carried in a new "payload" field.

Reported-by: Jim Mattson <jmattson@google.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Peter Shier <pshier@google.com>
---
 Documentation/virtual/kvm/api.txt     | 13 +++++++++++--
 arch/x86/include/asm/kvm_host.h       |  3 +++
 arch/x86/include/uapi/asm/kvm.h       |  6 ++++--
 arch/x86/kvm/x86.c                    | 28 ++++++++++++++++++++++++---
 tools/arch/x86/include/uapi/asm/kvm.h |  6 ++++--
 5 files changed, 47 insertions(+), 9 deletions(-)

Comments

Liran Alon Oct. 9, 2018, 9:25 a.m. UTC | #1
> On 8 Oct 2018, at 21:29, Jim Mattson <jmattson@google.com> wrote:
> 
> Under nested virtualization, the L1 hypervisor may intercept an
> exception raised during the execution of L2 before the exception
> is delivered. When the intercepted exception is #PF, the VM-exit
> to the L1 hypervisor precedes the modification of CR2. When the
> intercepted exception is #DB, the VM-exit to the L1 hypervisor
> precedes the modifications of DR6 and DR7 under VMX, but the
> VM-exit to the L1 hypervisor follows the modifications of DR6 and
> DR7 under SVM.
> 
> At present, CR2 is modified too early under both VMX and SVM. DR6 is
> modified too early under VMX. DR7 is modified at the appropriate time.
> Unfortunately, it is possible to exit to userspace with one of these
> exceptions pending, and userspace may rely on the premature
> side-effects. It is also possible for userspace to inject one of these
> exceptions, in which case, userspace will presumably have already
> processed the side-effects.
> 
> To address this problem, a new per-VM capability
> (KVM_CAP_EXCEPTION_PAYLOAD) will be introduced. When this capability
> is enabled by userspace, the faulting linear address will be included
> with the information about a pending #PF in L2, and the "new DR6 bits"
> will be included with the information about a pending #DB in L2. This
> ancillary exception information is carried in a new "payload" field.
> 
> Reported-by: Jim Mattson <jmattson@google.com>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Jim Mattson <jmattson@google.com>
> Reviewed-by: Peter Shier <pshier@google.com>
> ---
> Documentation/virtual/kvm/api.txt     | 13 +++++++++++--
> arch/x86/include/asm/kvm_host.h       |  3 +++
> arch/x86/include/uapi/asm/kvm.h       |  6 ++++--
> arch/x86/kvm/x86.c                    | 28 ++++++++++++++++++++++++---
> tools/arch/x86/include/uapi/asm/kvm.h |  6 ++++--
> 5 files changed, 47 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 647f94128a85..2df2cca81cf5 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -850,7 +850,7 @@ struct kvm_vcpu_events {
> 		__u8 injected;
> 		__u8 nr;
> 		__u8 has_error_code;
> -		__u8 pad;
> +		__u8 has_payload;
> 		__u32 error_code;
> 	} exception;
> 	struct {
> @@ -873,9 +873,11 @@ struct kvm_vcpu_events {
> 		__u8 smm_inside_nmi;
> 		__u8 latched_init;
> 	} smi;
> +	__u32 reserved[7];
> +	__u64 exception_payload;
> };
> 
> -Only two fields are defined in the flags field:
> +The following bits are defined in the flags field:
> 
> - KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
>   interrupt.shadow contains a valid state.
> @@ -883,6 +885,10 @@ Only two fields are defined in the flags field:
> - KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
>   smi contains a valid state.
> 
> +- KVM_VCPUEVENT_VALID_PAYLOAD may be set in the flags field to signal that
> +  exception.has_payload and payload contain valid state
> +  (i.e. KVM_CAP_EXCEPTION_PAYLOAD is enabled).
> +
> ARM/ARM64:
> 
> If the guest accesses a device that is being emulated by the host kernel in
> @@ -961,6 +967,9 @@ shall be written into the VCPU.
> 
> KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.
> 
> +KVM_VCPUEVENT_VALID_PAYLOAD can only be set if KVM_CAP_EXCEPTION_PAYLOAD
> +is enabled.
> +
> ARM/ARM64:
> 
> Set the pending SError exception state for this VCPU. It is not possible to
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 09b2e3e2cf1b..026229a593f2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -585,6 +585,8 @@ struct kvm_vcpu_arch {
> 		bool has_error_code;
> 		u8 nr;
> 		u32 error_code;
> +		unsigned long payload;
> +		bool has_payload;
> 		u8 nested_apf;
> 	} exception;
> 
> @@ -871,6 +873,7 @@ struct kvm_arch {
> 	bool x2apic_broadcast_quirk_disabled;
> 
> 	bool guest_can_read_msr_platform_info;
> +	bool exception_payload_enabled;
> };
> 
> struct kvm_vm_stat {
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index fd23d5778ea1..e3ea52bdd461 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -288,6 +288,7 @@ struct kvm_reinject_control {
> #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
> #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
> #define KVM_VCPUEVENT_VALID_SMM		0x00000008
> +#define KVM_VCPUEVENT_VALID_PAYLOAD	0x00000010
> 
> /* Interrupt shadow states */
> #define KVM_X86_SHADOW_INT_MOV_SS	0x01
> @@ -299,7 +300,7 @@ struct kvm_vcpu_events {
> 		__u8 injected;
> 		__u8 nr;
> 		__u8 has_error_code;
> -		__u8 pad;
> +		__u8 has_payload;
> 		__u32 error_code;
> 	} exception;
> 	struct {
> @@ -322,7 +323,8 @@ struct kvm_vcpu_events {
> 		__u8 smm_inside_nmi;
> 		__u8 latched_init;
> 	} smi;
> -	__u32 reserved[9];
> +	__u32 reserved[7];
> +	__u64 exception_payload;
> };
> 
> /* for KVM_GET/SET_DEBUGREGS */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index edbf00ec56b3..dbc538d66505 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -431,6 +431,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
> 		vcpu->arch.exception.has_error_code = has_error;
> 		vcpu->arch.exception.nr = nr;
> 		vcpu->arch.exception.error_code = error_code;
> +		vcpu->arch.exception.has_payload = false;
> +		vcpu->arch.exception.payload = 0;
> 		return;
> 	}
> 
> @@ -455,6 +457,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
> 		vcpu->arch.exception.has_error_code = true;
> 		vcpu->arch.exception.nr = DF_VECTOR;
> 		vcpu->arch.exception.error_code = 0;
> +		vcpu->arch.exception.has_payload = false;
> +		vcpu->arch.exception.payload = 0;
> 	} else
> 		/* replace previous exception with a new one in a hope
> 		   that instruction re-execution will regenerate lost
> @@ -3373,8 +3377,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
> 		!kvm_exception_is_soft(vcpu->arch.exception.nr);
> 	events->exception.nr = vcpu->arch.exception.nr;
> 	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
> -	events->exception.pad = 0;
> 	events->exception.error_code = vcpu->arch.exception.error_code;
> +	events->exception.has_payload = vcpu->arch.exception.has_payload;
> +	events->exception_payload = vcpu->arch.exception.payload;
> 
> 	events->interrupt.injected =
> 		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
> @@ -3398,6 +3403,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
> 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
> 			 | KVM_VCPUEVENT_VALID_SHADOW
> 			 | KVM_VCPUEVENT_VALID_SMM);
> +	if (vcpu->kvm->arch.exception_payload_enabled)
> +		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
> 	memset(&events->reserved, 0, sizeof(events->reserved));
> }
> 
> @@ -3409,12 +3416,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
> 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
> 			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
> 			      | KVM_VCPUEVENT_VALID_SHADOW
> -			      | KVM_VCPUEVENT_VALID_SMM))
> +			      | KVM_VCPUEVENT_VALID_SMM
> +			      | KVM_VCPUEVENT_VALID_PAYLOAD))
> +		return -EINVAL;
> +
> +	if ((events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) &&
> +	    !vcpu->kvm->arch.exception_payload_enabled)
> 		return -EINVAL;
> 
> 	if (events->exception.injected &&
> 	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
> -	     is_guest_mode(vcpu)))
> +	     (is_guest_mode(vcpu) &&
> +	      !vcpu->kvm->arch.exception_payload_enabled)))
> 		return -EINVAL;
> 
> 	/* INITs are latched while in SMM */
> @@ -3429,6 +3442,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
> 	vcpu->arch.exception.nr = events->exception.nr;
> 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
> 	vcpu->arch.exception.error_code = events->exception.error_code;
> +	if (vcpu->kvm->arch.exception_payload_enabled) {
> +		vcpu->arch.exception.has_payload = events->exception.has_payload;
> +		vcpu->arch.exception.payload = events->exception_payload;
> +	} else {
> +		vcpu->arch.exception.has_payload = 0;
> +		vcpu->arch.exception.payload = 0;
> +	}
> 
> 	vcpu->arch.interrupt.injected = events->interrupt.injected;
> 	vcpu->arch.interrupt.nr = events->interrupt.nr;
> @@ -9463,6 +9483,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
> 			vcpu->arch.exception.nr = 0;
> 			vcpu->arch.exception.has_error_code = false;
> 			vcpu->arch.exception.error_code = 0;
> +			vcpu->arch.exception.has_payload = false;
> +			vcpu->arch.exception.payload = 0;
> 		} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
> 			fault.vector = PF_VECTOR;
> 			fault.error_code_valid = true;
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index 86299efa804a..c4e4f03ce436 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -288,6 +288,7 @@ struct kvm_reinject_control {
> #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
> #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
> #define KVM_VCPUEVENT_VALID_SMM		0x00000008
> +#define KVM_VCPUEVENT_VALID_PAYLOAD	0x00000010
> 
> /* Interrupt shadow states */
> #define KVM_X86_SHADOW_INT_MOV_SS	0x01
> @@ -299,7 +300,7 @@ struct kvm_vcpu_events {
> 		__u8 injected;
> 		__u8 nr;
> 		__u8 has_error_code;
> -		__u8 pad;
> +		__u8 has_payload;
> 		__u32 error_code;
> 	} exception;
> 	struct {
> @@ -322,7 +323,8 @@ struct kvm_vcpu_events {
> 		__u8 smm_inside_nmi;
> 		__u8 latched_init;
> 	} smi;
> -	__u32 reserved[9];
> +	__u32 reserved[7];
> +	__u64 payload;
> };
> 
> /* for KVM_GET/SET_DEBUGREGS */
> -- 
> 2.19.0.605.g01d371f741-goog
> 

Reviewed-by: Liran Alon <liran.alon@oracle.com>
Paolo Bonzini Oct. 9, 2018, 4:08 p.m. UTC | #2
On 08/10/2018 20:29, Jim Mattson wrote:
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 647f94128a85..2df2cca81cf5 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -850,7 +850,7 @@ struct kvm_vcpu_events {
>  		__u8 injected;
>  		__u8 nr;
>  		__u8 has_error_code;
> -		__u8 pad;
> +		__u8 has_payload;
>  		__u32 error_code;
>  	} exception;
>  	struct {
> @@ -873,9 +873,11 @@ struct kvm_vcpu_events {
>  		__u8 smm_inside_nmi;
>  		__u8 latched_init;
>  	} smi;
> +	__u32 reserved[7];
> +	__u64 exception_payload;
>  };

This changes the size of the struct, and thus the ioctl number.  The
good part is that you don't need the capability, the bad part is that
you need a struct kvm_vcpu_events2 and ioctl KVM_GET/SET_VCPU_EVENTS2.

Thanks,

Paolo
Jim Mattson Oct. 9, 2018, 4:25 p.m. UTC | #3
On Tue, Oct 9, 2018 at 9:08 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 08/10/2018 20:29, Jim Mattson wrote:
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index 647f94128a85..2df2cca81cf5 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -850,7 +850,7 @@ struct kvm_vcpu_events {
>>               __u8 injected;
>>               __u8 nr;
>>               __u8 has_error_code;
>> -             __u8 pad;
>> +             __u8 has_payload;
>>               __u32 error_code;
>>       } exception;
>>       struct {
>> @@ -873,9 +873,11 @@ struct kvm_vcpu_events {
>>               __u8 smm_inside_nmi;
>>               __u8 latched_init;
>>       } smi;
>> +     __u32 reserved[7];
>> +     __u64 exception_payload;
>>  };
>
> This changes the size of the struct, and thus the ioctl number.  The
> good part is that you don't need the capability, the bad part is that
> you need a struct kvm_vcpu_events2 and ioctl KVM_GET/SET_VCPU_EVENTS2.

It's not in the api.txt file, but there's an existing __u32
reserved[9] in arch/x86/include/uapi/asm/kvm.h.
Paolo Bonzini Oct. 9, 2018, 4:40 p.m. UTC | #4
On 09/10/2018 18:25, Jim Mattson wrote:
>> This changes the size of the struct, and thus the ioctl number.  The
>> good part is that you don't need the capability, the bad part is that
>> you need a struct kvm_vcpu_events2 and ioctl KVM_GET/SET_VCPU_EVENTS2.
> It's not in the api.txt file, but there's an existing __u32
> reserved[9] in arch/x86/include/uapi/asm/kvm.h.

Oh, good. :)

Paolo
Paolo Bonzini Oct. 15, 2018, 5:07 p.m. UTC | #5
On 08/10/2018 20:29, Jim Mattson wrote:
> Under nested virtualization, the L1 hypervisor may intercept an
> exception raised during the execution of L2 before the exception
> is delivered. When the intercepted exception is #PF, the VM-exit
> to the L1 hypervisor precedes the modification of CR2. When the
> intercepted exception is #DB, the VM-exit to the L1 hypervisor
> precedes the modifications of DR6 and DR7 under VMX, but the
> VM-exit to the L1 hypervisor follows the modifications of DR6 and
> DR7 under SVM.
> 
> At present, CR2 is modified too early under both VMX and SVM. DR6 is
> modified too early under VMX. DR7 is modified at the appropriate time.
> Unfortunately, it is possible to exit to userspace with one of these
> exceptions pending, and userspace may rely on the premature
> side-effects. It is also possible for userspace to inject one of these
> exceptions, in which case, userspace will presumably have already
> processed the side-effects.
> 
> To address this problem, a new per-VM capability
> (KVM_CAP_EXCEPTION_PAYLOAD) will be introduced. When this capability
> is enabled by userspace, the faulting linear address will be included
> with the information about a pending #PF in L2, and the "new DR6 bits"
> will be included with the information about a pending #DB in L2. This
> ancillary exception information is carried in a new "payload" field.
> 
> Reported-by: Jim Mattson <jmattson@google.com>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Jim Mattson <jmattson@google.com>
> Reviewed-by: Peter Shier <pshier@google.com>
> ---
>  Documentation/virtual/kvm/api.txt     | 13 +++++++++++--
>  arch/x86/include/asm/kvm_host.h       |  3 +++
>  arch/x86/include/uapi/asm/kvm.h       |  6 ++++--
>  arch/x86/kvm/x86.c                    | 28 ++++++++++++++++++++++++---
>  tools/arch/x86/include/uapi/asm/kvm.h |  6 ++++--
>  5 files changed, 47 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 647f94128a85..2df2cca81cf5 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -850,7 +850,7 @@ struct kvm_vcpu_events {
>  		__u8 injected;
>  		__u8 nr;
>  		__u8 has_error_code;
> -		__u8 pad;
> +		__u8 has_payload;
>  		__u32 error_code;
>  	} exception;
>  	struct {
> @@ -873,9 +873,11 @@ struct kvm_vcpu_events {
>  		__u8 smm_inside_nmi;
>  		__u8 latched_init;
>  	} smi;
> +	__u32 reserved[7];
> +	__u64 exception_payload;
>  };
>  
> -Only two fields are defined in the flags field:
> +The following bits are defined in the flags field:
>  
>  - KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
>    interrupt.shadow contains a valid state.
> @@ -883,6 +885,10 @@ Only two fields are defined in the flags field:
>  - KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
>    smi contains a valid state.
>  
> +- KVM_VCPUEVENT_VALID_PAYLOAD may be set in the flags field to signal that
> +  exception.has_payload and payload contain valid state
> +  (i.e. KVM_CAP_EXCEPTION_PAYLOAD is enabled).
> +
>  ARM/ARM64:
>  
>  If the guest accesses a device that is being emulated by the host kernel in
> @@ -961,6 +967,9 @@ shall be written into the VCPU.
>  
>  KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.
>  
> +KVM_VCPUEVENT_VALID_PAYLOAD can only be set if KVM_CAP_EXCEPTION_PAYLOAD
> +is enabled.
> +
>  ARM/ARM64:
>  
>  Set the pending SError exception state for this VCPU. It is not possible to
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 09b2e3e2cf1b..026229a593f2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -585,6 +585,8 @@ struct kvm_vcpu_arch {
>  		bool has_error_code;
>  		u8 nr;
>  		u32 error_code;
> +		unsigned long payload;
> +		bool has_payload;
>  		u8 nested_apf;
>  	} exception;
>  
> @@ -871,6 +873,7 @@ struct kvm_arch {
>  	bool x2apic_broadcast_quirk_disabled;
>  
>  	bool guest_can_read_msr_platform_info;
> +	bool exception_payload_enabled;
>  };
>  
>  struct kvm_vm_stat {
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index fd23d5778ea1..e3ea52bdd461 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -288,6 +288,7 @@ struct kvm_reinject_control {
>  #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
>  #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
>  #define KVM_VCPUEVENT_VALID_SMM		0x00000008
> +#define KVM_VCPUEVENT_VALID_PAYLOAD	0x00000010
>  
>  /* Interrupt shadow states */
>  #define KVM_X86_SHADOW_INT_MOV_SS	0x01
> @@ -299,7 +300,7 @@ struct kvm_vcpu_events {
>  		__u8 injected;
>  		__u8 nr;
>  		__u8 has_error_code;
> -		__u8 pad;
> +		__u8 has_payload;
>  		__u32 error_code;
>  	} exception;
>  	struct {
> @@ -322,7 +323,8 @@ struct kvm_vcpu_events {
>  		__u8 smm_inside_nmi;
>  		__u8 latched_init;
>  	} smi;
> -	__u32 reserved[9];
> +	__u32 reserved[7];
> +	__u64 exception_payload;
>  };
>  
>  /* for KVM_GET/SET_DEBUGREGS */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index edbf00ec56b3..dbc538d66505 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -431,6 +431,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
>  		vcpu->arch.exception.has_error_code = has_error;
>  		vcpu->arch.exception.nr = nr;
>  		vcpu->arch.exception.error_code = error_code;
> +		vcpu->arch.exception.has_payload = false;
> +		vcpu->arch.exception.payload = 0;
>  		return;
>  	}
>  
> @@ -455,6 +457,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
>  		vcpu->arch.exception.has_error_code = true;
>  		vcpu->arch.exception.nr = DF_VECTOR;
>  		vcpu->arch.exception.error_code = 0;
> +		vcpu->arch.exception.has_payload = false;
> +		vcpu->arch.exception.payload = 0;
>  	} else
>  		/* replace previous exception with a new one in a hope
>  		   that instruction re-execution will regenerate lost
> @@ -3373,8 +3377,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
>  		!kvm_exception_is_soft(vcpu->arch.exception.nr);
>  	events->exception.nr = vcpu->arch.exception.nr;
>  	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
> -	events->exception.pad = 0;
>  	events->exception.error_code = vcpu->arch.exception.error_code;
> +	events->exception.has_payload = vcpu->arch.exception.has_payload;
> +	events->exception_payload = vcpu->arch.exception.payload;
>  
>  	events->interrupt.injected =
>  		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
> @@ -3398,6 +3403,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
>  	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
>  			 | KVM_VCPUEVENT_VALID_SHADOW
>  			 | KVM_VCPUEVENT_VALID_SMM);
> +	if (vcpu->kvm->arch.exception_payload_enabled)
> +		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
>  	memset(&events->reserved, 0, sizeof(events->reserved));
>  }
>  
> @@ -3409,12 +3416,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
>  	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
>  			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
>  			      | KVM_VCPUEVENT_VALID_SHADOW
> -			      | KVM_VCPUEVENT_VALID_SMM))
> +			      | KVM_VCPUEVENT_VALID_SMM
> +			      | KVM_VCPUEVENT_VALID_PAYLOAD))
> +		return -EINVAL;
> +
> +	if ((events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) &&
> +	    !vcpu->kvm->arch.exception_payload_enabled)
>  		return -EINVAL;
>  
>  	if (events->exception.injected &&
>  	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
> -	     is_guest_mode(vcpu)))
> +	     (is_guest_mode(vcpu) &&
> +	      !vcpu->kvm->arch.exception_payload_enabled)))
>  		return -EINVAL;
>  
>  	/* INITs are latched while in SMM */
> @@ -3429,6 +3442,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
>  	vcpu->arch.exception.nr = events->exception.nr;
>  	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
>  	vcpu->arch.exception.error_code = events->exception.error_code;
> +	if (vcpu->kvm->arch.exception_payload_enabled) {
> +		vcpu->arch.exception.has_payload = events->exception.has_payload;
> +		vcpu->arch.exception.payload = events->exception_payload;
> +	} else {
> +		vcpu->arch.exception.has_payload = 0;
> +		vcpu->arch.exception.payload = 0;
> +	}
>  
>  	vcpu->arch.interrupt.injected = events->interrupt.injected;
>  	vcpu->arch.interrupt.nr = events->interrupt.nr;
> @@ -9463,6 +9483,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
>  			vcpu->arch.exception.nr = 0;
>  			vcpu->arch.exception.has_error_code = false;
>  			vcpu->arch.exception.error_code = 0;
> +			vcpu->arch.exception.has_payload = false;
> +			vcpu->arch.exception.payload = 0;
>  		} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
>  			fault.vector = PF_VECTOR;
>  			fault.error_code_valid = true;
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index 86299efa804a..c4e4f03ce436 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -288,6 +288,7 @@ struct kvm_reinject_control {
>  #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
>  #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
>  #define KVM_VCPUEVENT_VALID_SMM		0x00000008
> +#define KVM_VCPUEVENT_VALID_PAYLOAD	0x00000010
>  
>  /* Interrupt shadow states */
>  #define KVM_X86_SHADOW_INT_MOV_SS	0x01
> @@ -299,7 +300,7 @@ struct kvm_vcpu_events {
>  		__u8 injected;
>  		__u8 nr;
>  		__u8 has_error_code;
> -		__u8 pad;
> +		__u8 has_payload;
>  		__u32 error_code;
>  	} exception;
>  	struct {
> @@ -322,7 +323,8 @@ struct kvm_vcpu_events {
>  		__u8 smm_inside_nmi;
>  		__u8 latched_init;
>  	} smi;
> -	__u32 reserved[9];
> +	__u32 reserved[7];
> +	__u64 payload;
>  };
>  
>  /* for KVM_GET/SET_DEBUGREGS */
> 

I'm not applying any of the patches yet, but I'd be happy to queue it
for 4.20 even (slightly) after -rc1.  Once this is done, I think we can
flip nested=1.

Paolo
Liran Alon Oct. 15, 2018, 6:07 p.m. UTC | #6
> On 15 Oct 2018, at 20:07, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 08/10/2018 20:29, Jim Mattson wrote:
>> Under nested virtualization, the L1 hypervisor may intercept an
>> exception raised during the execution of L2 before the exception
>> is delivered. When the intercepted exception is #PF, the VM-exit
>> to the L1 hypervisor precedes the modification of CR2. When the
>> intercepted exception is #DB, the VM-exit to the L1 hypervisor
>> precedes the modifications of DR6 and DR7 under VMX, but the
>> VM-exit to the L1 hypervisor follows the modifications of DR6 and
>> DR7 under SVM.
>> 
>> At present, CR2 is modified too early under both VMX and SVM. DR6 is
>> modified too early under VMX. DR7 is modified at the appropriate time.
>> Unfortunately, it is possible to exit to userspace with one of these
>> exceptions pending, and userspace may rely on the premature
>> side-effects. It is also possible for userspace to inject one of these
>> exceptions, in which case, userspace will presumably have already
>> processed the side-effects.
>> 
>> To address this problem, a new per-VM capability
>> (KVM_CAP_EXCEPTION_PAYLOAD) will be introduced. When this capability
>> is enabled by userspace, the faulting linear address will be included
>> with the information about a pending #PF in L2, and the "new DR6 bits"
>> will be included with the information about a pending #DB in L2. This
>> ancillary exception information is carried in a new "payload" field.
>> 
>> Reported-by: Jim Mattson <jmattson@google.com>
>> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
>> Signed-off-by: Jim Mattson <jmattson@google.com>
>> Reviewed-by: Peter Shier <pshier@google.com>
>> 
> 
> I'm not applying any of the patches yet, but I'd be happy to queue it
> for 4.20 even (slightly) after -rc1.  Once this is done, I think we can
> flip nested=1.
> 
> Paolo

Just out of curiosity, why is this specifically a decision-point for flipping nested=1?
Because this is a break of userspace APIs in case of nVMX workloads?
What are the conditions for flipping it?

If that’s the case, this is also true for the get/set vCPU events IOCTLs.
So they need to also be applied before the flip. 

Don’t get me wrong, I will be happy to see nested=1 by default.
Just wondering…

-Liran
Paolo Bonzini Oct. 15, 2018, 6:42 p.m. UTC | #7
On 15/10/2018 20:07, Liran Alon wrote:
> Just out of curiosity, why is this specifically a decision-point for flipping nested=1?
> Because this is a break of userspace APIs in case of nVMX workloads?
> What are the conditions for flipping it?
> 
> If that’s the case, this is also true for the get/set vCPU events IOCTLs.
> So they need to also be applied before the flip. 

Yes, I meant v2 of the whole series.  It is not strictly necessary to
have this before the flip, but we're close enough that it makes sense to
order that.

Paolo
Liran Alon Oct. 15, 2018, 11:01 p.m. UTC | #8
> On 15 Oct 2018, at 21:42, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 15/10/2018 20:07, Liran Alon wrote:
>> Just out of curiosity, why is this specifically a decision-point for flipping nested=1?
>> Because this is a break of userspace APIs in case of nVMX workloads?
>> What are the conditions for flipping it?
>> 
>> If that’s the case, this is also true for the get/set vCPU events IOCTLs.
>> So they need to also be applied before the flip. 
> 
> Yes, I meant v2 of the whole series.  It is not strictly necessary to
> have this before the flip, but we're close enough that it makes sense to
> order that.
> 
> Paolo

Kinda weird that this decision seem to easily have been taken for AMD in commit:
4b6e4dca7011 ("KVM: SVM: enable nested svm by default”)
I highly doubt that KVM nSVM is really more stable than KVM nVMX…
But maybe I’m wrong.

-Liran
Paolo Bonzini Oct. 15, 2018, 11:47 p.m. UTC | #9
On 16/10/2018 01:01, Liran Alon wrote:
>> Yes, I meant v2 of the whole series.  It is not strictly necessary to
>> have this before the flip, but we're close enough that it makes sense to
>> order that.
>>
>> Paolo
> Kinda weird that this decision seem to easily have been taken for AMD in commit:
> 4b6e4dca7011 ("KVM: SVM: enable nested svm by default”)
> I highly doubt that KVM nSVM is really more stable than KVM nVMX…
> But maybe I’m wrong.

No, indeed.  That was not a good idea in retrospect.

Paolo

Patch
diff mbox series

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 647f94128a85..2df2cca81cf5 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -850,7 +850,7 @@  struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 has_error_code;
-		__u8 pad;
+		__u8 has_payload;
 		__u32 error_code;
 	} exception;
 	struct {
@@ -873,9 +873,11 @@  struct kvm_vcpu_events {
 		__u8 smm_inside_nmi;
 		__u8 latched_init;
 	} smi;
+	__u32 reserved[7];
+	__u64 exception_payload;
 };
 
-Only two fields are defined in the flags field:
+The following bits are defined in the flags field:
 
 - KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
   interrupt.shadow contains a valid state.
@@ -883,6 +885,10 @@  Only two fields are defined in the flags field:
 - KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that
   smi contains a valid state.
 
+- KVM_VCPUEVENT_VALID_PAYLOAD may be set in the flags field to signal that
+  exception.has_payload and payload contain valid state
+  (i.e. KVM_CAP_EXCEPTION_PAYLOAD is enabled).
+
 ARM/ARM64:
 
 If the guest accesses a device that is being emulated by the host kernel in
@@ -961,6 +967,9 @@  shall be written into the VCPU.
 
 KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available.
 
+KVM_VCPUEVENT_VALID_PAYLOAD can only be set if KVM_CAP_EXCEPTION_PAYLOAD
+is enabled.
+
 ARM/ARM64:
 
 Set the pending SError exception state for this VCPU. It is not possible to
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09b2e3e2cf1b..026229a593f2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -585,6 +585,8 @@  struct kvm_vcpu_arch {
 		bool has_error_code;
 		u8 nr;
 		u32 error_code;
+		unsigned long payload;
+		bool has_payload;
 		u8 nested_apf;
 	} exception;
 
@@ -871,6 +873,7 @@  struct kvm_arch {
 	bool x2apic_broadcast_quirk_disabled;
 
 	bool guest_can_read_msr_platform_info;
+	bool exception_payload_enabled;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index fd23d5778ea1..e3ea52bdd461 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -288,6 +288,7 @@  struct kvm_reinject_control {
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
 #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
 #define KVM_VCPUEVENT_VALID_SMM		0x00000008
+#define KVM_VCPUEVENT_VALID_PAYLOAD	0x00000010
 
 /* Interrupt shadow states */
 #define KVM_X86_SHADOW_INT_MOV_SS	0x01
@@ -299,7 +300,7 @@  struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 has_error_code;
-		__u8 pad;
+		__u8 has_payload;
 		__u32 error_code;
 	} exception;
 	struct {
@@ -322,7 +323,8 @@  struct kvm_vcpu_events {
 		__u8 smm_inside_nmi;
 		__u8 latched_init;
 	} smi;
-	__u32 reserved[9];
+	__u32 reserved[7];
+	__u64 exception_payload;
 };
 
 /* for KVM_GET/SET_DEBUGREGS */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index edbf00ec56b3..dbc538d66505 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -431,6 +431,8 @@  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.nr = nr;
 		vcpu->arch.exception.error_code = error_code;
+		vcpu->arch.exception.has_payload = false;
+		vcpu->arch.exception.payload = 0;
 		return;
 	}
 
@@ -455,6 +457,8 @@  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.has_error_code = true;
 		vcpu->arch.exception.nr = DF_VECTOR;
 		vcpu->arch.exception.error_code = 0;
+		vcpu->arch.exception.has_payload = false;
+		vcpu->arch.exception.payload = 0;
 	} else
 		/* replace previous exception with a new one in a hope
 		   that instruction re-execution will regenerate lost
@@ -3373,8 +3377,9 @@  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 		!kvm_exception_is_soft(vcpu->arch.exception.nr);
 	events->exception.nr = vcpu->arch.exception.nr;
 	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-	events->exception.pad = 0;
 	events->exception.error_code = vcpu->arch.exception.error_code;
+	events->exception.has_payload = vcpu->arch.exception.has_payload;
+	events->exception_payload = vcpu->arch.exception.payload;
 
 	events->interrupt.injected =
 		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
@@ -3398,6 +3403,8 @@  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
 			 | KVM_VCPUEVENT_VALID_SHADOW
 			 | KVM_VCPUEVENT_VALID_SMM);
+	if (vcpu->kvm->arch.exception_payload_enabled)
+		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
 	memset(&events->reserved, 0, sizeof(events->reserved));
 }
 
@@ -3409,12 +3416,18 @@  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
 			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			      | KVM_VCPUEVENT_VALID_SHADOW
-			      | KVM_VCPUEVENT_VALID_SMM))
+			      | KVM_VCPUEVENT_VALID_SMM
+			      | KVM_VCPUEVENT_VALID_PAYLOAD))
+		return -EINVAL;
+
+	if ((events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) &&
+	    !vcpu->kvm->arch.exception_payload_enabled)
 		return -EINVAL;
 
 	if (events->exception.injected &&
 	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
-	     is_guest_mode(vcpu)))
+	     (is_guest_mode(vcpu) &&
+	      !vcpu->kvm->arch.exception_payload_enabled)))
 		return -EINVAL;
 
 	/* INITs are latched while in SMM */
@@ -3429,6 +3442,13 @@  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
 	vcpu->arch.exception.error_code = events->exception.error_code;
+	if (vcpu->kvm->arch.exception_payload_enabled) {
+		vcpu->arch.exception.has_payload = events->exception.has_payload;
+		vcpu->arch.exception.payload = events->exception_payload;
+	} else {
+		vcpu->arch.exception.has_payload = 0;
+		vcpu->arch.exception.payload = 0;
+	}
 
 	vcpu->arch.interrupt.injected = events->interrupt.injected;
 	vcpu->arch.interrupt.nr = events->interrupt.nr;
@@ -9463,6 +9483,8 @@  void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 			vcpu->arch.exception.nr = 0;
 			vcpu->arch.exception.has_error_code = false;
 			vcpu->arch.exception.error_code = 0;
+			vcpu->arch.exception.has_payload = false;
+			vcpu->arch.exception.payload = 0;
 		} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
 			fault.vector = PF_VECTOR;
 			fault.error_code_valid = true;
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 86299efa804a..c4e4f03ce436 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -288,6 +288,7 @@  struct kvm_reinject_control {
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
 #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
 #define KVM_VCPUEVENT_VALID_SMM		0x00000008
+#define KVM_VCPUEVENT_VALID_PAYLOAD	0x00000010
 
 /* Interrupt shadow states */
 #define KVM_X86_SHADOW_INT_MOV_SS	0x01
@@ -299,7 +300,7 @@  struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 has_error_code;
-		__u8 pad;
+		__u8 has_payload;
 		__u32 error_code;
 	} exception;
 	struct {
@@ -322,7 +323,8 @@  struct kvm_vcpu_events {
 		__u8 smm_inside_nmi;
 		__u8 latched_init;
 	} smi;
-	__u32 reserved[9];
+	__u32 reserved[7];
+	__u64 payload;
 };
 
 /* for KVM_GET/SET_DEBUGREGS */