diff mbox series

kvm: x86: Add logical CPU to KVM_EXIT_FAIL_ENTRY info

Message ID 20191213231646.88015-1-jmattson@google.com (mailing list archive)
State New, archived
Headers show
Series kvm: x86: Add logical CPU to KVM_EXIT_FAIL_ENTRY info | expand

Commit Message

Jim Mattson Dec. 13, 2019, 11:16 p.m. UTC
More often than not, a failed VM-entry in a production environment is
the result of a defective CPU (at least, insofar as Intel x86 is
concerned). To aid in identifying the bad hardware, add the logical
CPU to the information provided to userspace on a KVM exit with reason
KVM_EXIT_FAIL_ENTRY. The presence of this additional information is
indicated by a new capability, KVM_CAP_FAILED_ENTRY_CPU.

Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Peter Shier <pshier@google.com>
Reviewed-by: Oliver Upton <oupton@google.com>
---
 Documentation/virt/kvm/api.txt | 1 +
 arch/x86/kvm/svm.c             | 1 +
 arch/x86/kvm/vmx/vmx.c         | 2 ++
 arch/x86/kvm/x86.c             | 1 +
 include/uapi/linux/kvm.h       | 2 ++
 5 files changed, 7 insertions(+)

Comments

Liran Alon Dec. 13, 2019, 11:26 p.m. UTC | #1
> On 14 Dec 2019, at 1:16, Jim Mattson <jmattson@google.com> wrote:
> 
> More often than not, a failed VM-entry in a production environment is
> the result of a defective CPU (at least, insofar as Intel x86 is
> concerned). To aid in identifying the bad hardware, add the logical
> CPU to the information provided to userspace on a KVM exit with reason
> KVM_EXIT_FAIL_ENTRY. The presence of this additional information is
> indicated by a new capability, KVM_CAP_FAILED_ENTRY_CPU.
> 
> Signed-off-by: Jim Mattson <jmattson@google.com>
> Reviewed-by: Peter Shier <pshier@google.com>
> Reviewed-by: Oliver Upton <oupton@google.com>
> ---
> Documentation/virt/kvm/api.txt | 1 +
> arch/x86/kvm/svm.c             | 1 +
> arch/x86/kvm/vmx/vmx.c         | 2 ++
> arch/x86/kvm/x86.c             | 1 +
> include/uapi/linux/kvm.h       | 2 ++
> 5 files changed, 7 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> index ebb37b34dcfc..6e5d92406b65 100644
> --- a/Documentation/virt/kvm/api.txt
> +++ b/Documentation/virt/kvm/api.txt
> @@ -4245,6 +4245,7 @@ hardware_exit_reason.
> 		/* KVM_EXIT_FAIL_ENTRY */
> 		struct {
> 			__u64 hardware_entry_failure_reason;
> +			__u32 cpu; /* if KVM_CAP_FAILED_ENTRY_CPU */
> 		} fail_entry;
> 
> If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 122d4ce3b1ab..4d06b2413c63 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -4980,6 +4980,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
> 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> 		kvm_run->fail_entry.hardware_entry_failure_reason
> 			= svm->vmcb->control.exit_code;
> +		kvm_run->fail_entry.cpu = raw_smp_processor_id();

Why not just use vcpu->cpu?
Same for vmx_handle_exit() to be consistent.

> 		dump_vmcb(vcpu);
> 		return 0;
> 	}
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index e3394c839dea..4d540b1c08e0 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5846,6 +5846,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> 		vcpu->run->fail_entry.hardware_entry_failure_reason
> 			= exit_reason;
> +		vcpu->run->fail_entry.cpu = vmx->loaded_vmcs->cpu;
> 		return 0;
> 	}
> 
> @@ -5854,6 +5855,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> 		vcpu->run->fail_entry.hardware_entry_failure_reason
> 			= vmcs_read32(VM_INSTRUCTION_ERROR);
> +		vcpu->run->fail_entry.cpu = vmx->loaded_vmcs->cpu;
> 		return 0;
> 	}
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cf917139de6b..9e89a32056d1 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3273,6 +3273,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> 	case KVM_CAP_GET_MSR_FEATURES:
> 	case KVM_CAP_MSR_PLATFORM_INFO:
> 	case KVM_CAP_EXCEPTION_PAYLOAD:
> +	case KVM_CAP_FAILED_ENTRY_CPU:
> 		r = 1;
> 		break;
> 	case KVM_CAP_SYNC_REGS:
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index f0a16b4adbbd..09ba7174456d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -277,6 +277,7 @@ struct kvm_run {
> 		/* KVM_EXIT_FAIL_ENTRY */
> 		struct {
> 			__u64 hardware_entry_failure_reason;
> +			__u32 cpu;
> 		} fail_entry;
> 		/* KVM_EXIT_EXCEPTION */
> 		struct {
> @@ -1009,6 +1010,7 @@ struct kvm_ppc_resize_hpt {
> #define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176
> #define KVM_CAP_ARM_NISV_TO_USER 177
> #define KVM_CAP_ARM_INJECT_EXT_DABT 178
> +#define KVM_CAP_FAILED_ENTRY_CPU 179
> 
> #ifdef KVM_CAP_IRQ_ROUTING
> 
> -- 
> 2.24.1.735.g03f4e72817-goog
>
Jim Mattson Dec. 13, 2019, 11:29 p.m. UTC | #2
On Fri, Dec 13, 2019 at 3:26 PM Liran Alon <liran.alon@oracle.com> wrote:
>
>
>
> > On 14 Dec 2019, at 1:16, Jim Mattson <jmattson@google.com> wrote:
> >
> > More often than not, a failed VM-entry in a production environment is
> > the result of a defective CPU (at least, insofar as Intel x86 is
> > concerned). To aid in identifying the bad hardware, add the logical
> > CPU to the information provided to userspace on a KVM exit with reason
> > KVM_EXIT_FAIL_ENTRY. The presence of this additional information is
> > indicated by a new capability, KVM_CAP_FAILED_ENTRY_CPU.
> >
> > Signed-off-by: Jim Mattson <jmattson@google.com>
> > Reviewed-by: Peter Shier <pshier@google.com>
> > Reviewed-by: Oliver Upton <oupton@google.com>
> > ---
> > Documentation/virt/kvm/api.txt | 1 +
> > arch/x86/kvm/svm.c             | 1 +
> > arch/x86/kvm/vmx/vmx.c         | 2 ++
> > arch/x86/kvm/x86.c             | 1 +
> > include/uapi/linux/kvm.h       | 2 ++
> > 5 files changed, 7 insertions(+)
> >
> > diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> > index ebb37b34dcfc..6e5d92406b65 100644
> > --- a/Documentation/virt/kvm/api.txt
> > +++ b/Documentation/virt/kvm/api.txt
> > @@ -4245,6 +4245,7 @@ hardware_exit_reason.
> >               /* KVM_EXIT_FAIL_ENTRY */
> >               struct {
> >                       __u64 hardware_entry_failure_reason;
> > +                     __u32 cpu; /* if KVM_CAP_FAILED_ENTRY_CPU */
> >               } fail_entry;
> >
> > If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 122d4ce3b1ab..4d06b2413c63 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -4980,6 +4980,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
> >               kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> >               kvm_run->fail_entry.hardware_entry_failure_reason
> >                       = svm->vmcb->control.exit_code;
> > +             kvm_run->fail_entry.cpu = raw_smp_processor_id();
>
> Why not just use vcpu->cpu?
> Same for vmx_handle_exit() to be consistent.

Ah. Perfect. Thanks.

> >               dump_vmcb(vcpu);
> >               return 0;
> >       }
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index e3394c839dea..4d540b1c08e0 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -5846,6 +5846,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> >               vcpu->run->fail_entry.hardware_entry_failure_reason
> >                       = exit_reason;
> > +             vcpu->run->fail_entry.cpu = vmx->loaded_vmcs->cpu;
> >               return 0;
> >       }
> >
> > @@ -5854,6 +5855,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> >               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> >               vcpu->run->fail_entry.hardware_entry_failure_reason
> >                       = vmcs_read32(VM_INSTRUCTION_ERROR);
> > +             vcpu->run->fail_entry.cpu = vmx->loaded_vmcs->cpu;
> >               return 0;
> >       }
> >
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index cf917139de6b..9e89a32056d1 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3273,6 +3273,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> >       case KVM_CAP_GET_MSR_FEATURES:
> >       case KVM_CAP_MSR_PLATFORM_INFO:
> >       case KVM_CAP_EXCEPTION_PAYLOAD:
> > +     case KVM_CAP_FAILED_ENTRY_CPU:
> >               r = 1;
> >               break;
> >       case KVM_CAP_SYNC_REGS:
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index f0a16b4adbbd..09ba7174456d 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -277,6 +277,7 @@ struct kvm_run {
> >               /* KVM_EXIT_FAIL_ENTRY */
> >               struct {
> >                       __u64 hardware_entry_failure_reason;
> > +                     __u32 cpu;
> >               } fail_entry;
> >               /* KVM_EXIT_EXCEPTION */
> >               struct {
> > @@ -1009,6 +1010,7 @@ struct kvm_ppc_resize_hpt {
> > #define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176
> > #define KVM_CAP_ARM_NISV_TO_USER 177
> > #define KVM_CAP_ARM_INJECT_EXT_DABT 178
> > +#define KVM_CAP_FAILED_ENTRY_CPU 179
> >
> > #ifdef KVM_CAP_IRQ_ROUTING
> >
> > --
> > 2.24.1.735.g03f4e72817-goog
> >
>
Paolo Bonzini Dec. 14, 2019, 7:32 a.m. UTC | #3
On 14/12/19 00:16, Jim Mattson wrote:
> More often than not, a failed VM-entry in a production environment is
> the result of a defective CPU (at least, insofar as Intel x86 is
> concerned).

It's comforting that someone else got to the same conclusion as we did...

Paolo

> To aid in identifying the bad hardware, add the logical
> CPU to the information provided to userspace on a KVM exit with reason
> KVM_EXIT_FAIL_ENTRY. The presence of this additional information is
> indicated by a new capability, KVM_CAP_FAILED_ENTRY_CPU.
> 
> Signed-off-by: Jim Mattson <jmattson@google.com>
> Reviewed-by: Peter Shier <pshier@google.com>
> Reviewed-by: Oliver Upton <oupton@google.com>
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index ebb37b34dcfc..6e5d92406b65 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -4245,6 +4245,7 @@  hardware_exit_reason.
 		/* KVM_EXIT_FAIL_ENTRY */
 		struct {
 			__u64 hardware_entry_failure_reason;
+			__u32 cpu; /* if KVM_CAP_FAILED_ENTRY_CPU */
 		} fail_entry;
 
 If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 122d4ce3b1ab..4d06b2413c63 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4980,6 +4980,7 @@  static int handle_exit(struct kvm_vcpu *vcpu)
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
 			= svm->vmcb->control.exit_code;
+		kvm_run->fail_entry.cpu = raw_smp_processor_id();
 		dump_vmcb(vcpu);
 		return 0;
 	}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e3394c839dea..4d540b1c08e0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5846,6 +5846,7 @@  static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
 			= exit_reason;
+		vcpu->run->fail_entry.cpu = vmx->loaded_vmcs->cpu;
 		return 0;
 	}
 
@@ -5854,6 +5855,7 @@  static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
 			= vmcs_read32(VM_INSTRUCTION_ERROR);
+		vcpu->run->fail_entry.cpu = vmx->loaded_vmcs->cpu;
 		return 0;
 	}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf917139de6b..9e89a32056d1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3273,6 +3273,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_GET_MSR_FEATURES:
 	case KVM_CAP_MSR_PLATFORM_INFO:
 	case KVM_CAP_EXCEPTION_PAYLOAD:
+	case KVM_CAP_FAILED_ENTRY_CPU:
 		r = 1;
 		break;
 	case KVM_CAP_SYNC_REGS:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f0a16b4adbbd..09ba7174456d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -277,6 +277,7 @@  struct kvm_run {
 		/* KVM_EXIT_FAIL_ENTRY */
 		struct {
 			__u64 hardware_entry_failure_reason;
+			__u32 cpu;
 		} fail_entry;
 		/* KVM_EXIT_EXCEPTION */
 		struct {
@@ -1009,6 +1010,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176
 #define KVM_CAP_ARM_NISV_TO_USER 177
 #define KVM_CAP_ARM_INJECT_EXT_DABT 178
+#define KVM_CAP_FAILED_ENTRY_CPU 179
 
 #ifdef KVM_CAP_IRQ_ROUTING