diff mbox series

[v6,2/4] KVM: x86: report negative values from wrmsr emulation to userspace

Message ID 20200922211025.175547-3-mlevitsk@redhat.com (mailing list archive)
State New, archived
Headers show
Series KVM: nSVM: ondemand nested state allocation | expand

Commit Message

Maxim Levitsky Sept. 22, 2020, 9:10 p.m. UTC
This will allow the KVM to report such errors (e.g -ENOMEM)
to the userspace.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 +++++--
 arch/x86/kvm/x86.c     | 6 +++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

Comments

Qian Cai Oct. 26, 2020, 7:40 p.m. UTC | #1
On Wed, 2020-09-23 at 00:10 +0300, Maxim Levitsky wrote:
> This will allow the KVM to report such errors (e.g -ENOMEM)
> to the userspace.
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>

Reverting this and its dependency:

72f211ecaa80 KVM: x86: allow kvm_x86_ops.set_efer to return an error value

on the top of linux-next (they have also unfortunately merged into the mainline
at the same time) fixed an issue that a simple Intel KVM guest is unable to boot
below.

.config: http://people.redhat.com/qcai/x86.config

qemu-kvm-4.2.0-34.module+el8.3.0+7976+077be4ec.x86_64

# /usr/libexec/qemu-kvm -name ubuntu-20.04-server-cloudimg -cpu host -smp 2 -m 2g -hda ./ubuntu-20.04-server-cloudimg.qcow2 -cdrom ./ubuntu-20.04-server-cloudimg.iso  -nic user,hostfwd=tcp::2222-:22 -nographic

[    1.141022] evm: Initialising EVM extended attributes:
[    1.143344] evm: security.selinux
[    1.144968] evm: security.SMACK64
[    1.146574] evm: security.SMACK64EXEC
[    1.148305] evm: security.SMACK64TRANSMUTE
[    1.150215] evm: security.SMACK64MMAP
[    1.151960] evm: security.apparmor
[    1.153755] evm: security.ima
[    1.155454] evm: security.capability
[    1.155456] evm: HMAC attrs: 0x1
[    1.162331] ata1.00: ATA-7: QEMU HARDDISK, 2.5+, max UDMA/100
[    1.162635] PM:   Magic number: 8:937:635
[    1.165607] ata1.00: 2147483648 sectors, multi 16: LBA48 
[    1.169799] scsi 0:0:0:0: Direct-Access     ATA      QEMU HARDDISK    2.5+ PQ: 0 ANSI: 5
[    1.174196] rtc_cmos 00:00: setting system clock to 2020-10-26T13:38:53 UTC (1603719533)
[    1.178237] sd 0:0:0:0: Attached scsi generic sg0 type 0
[    1.178293] sd 0:0:0:0: [sda] 2147483648 512-byte logical blocks: (1.10 TB/1.00 TiB)
[    1.180567] ata2.00: ATAPI: QEMU DVD-ROM, 2.5+, max UDMA/100
[    1.183986] sd 0:0:0:0: [sda] Write Protect is off
[error: kvm run failed No such file or directory
 RAX=0000000000000000 RBX=0000000000000000 RCX=0000000000000150 RDX=000000008000001c
RSI=0000000000000000 RDI=0000000000000150 RBP=ffffb67840083e40 RSP=ffffb67840083e00
R8 =ffff931dfda17608 R9 =0000000000000000 R10=ffff931dfda17848 R11=0000000000000000
R12=0000000000000000 R13=00000000000000b7 R14=ffff931dfd4013c0 R15=ffffffffaa8f48d0
RIP=ffffffffaa078894 RFL=00000246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0000 0000000000000000 ffffffff 00c00000
CS =0010 0000000000000000 ffffffff 00a09b00 DPL=0 CS64 [-RA]
SS =0000 0000000000000000 ffffffff 00c00000
DS =0000 0000000000000000 ffffffff 00c00000
FS =0000 0000000000000000 ffffffff 00c00000
GS =0000 ffff931dfda00000 ffffffff 00c00000
LDT=0000 0000000000000000 ffffffff 00c00000
TR =0040 fffffe0000003000 0000206f 00008b00 DPL=0 TSS64-busy
GDT=     fffffe0000001000 0000007f
IDT=     fffffe0000000000 00000fff
CR0=80050033 CR2=0000000000000000 CR3=000000002960a001 CR4=00760ef0
DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 
DR6=00000000fffe0ff0 DR7=0000000000000400
EFER=0000000000000d01
Code=dc 60 4e 00 4c 89 e0 41 5c 5d c3 0f 1f 44 00 00 89 f0 89 f9 <0f> 30 31 c0 0f 1f 44 00 00 c3 55 48 c1 e2 20 89 f6 48 09 d6 89 c2 48 89 e5 48 83 ec 08 89

> ---
>  arch/x86/kvm/emulate.c | 7 +++++--
>  arch/x86/kvm/x86.c     | 6 +++++-
>  2 files changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 1d450d7710d63..d855304f5a509 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -3702,13 +3702,16 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
>  static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
>  {
>  	u64 msr_data;
> +	int ret;
>  
>  	msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
>  		| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
> -	if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
> +
> +	ret = ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data);
> +	if (ret > 0)
>  		return emulate_gp(ctxt, 0);
>  
> -	return X86EMUL_CONTINUE;
> +	return ret < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
>  }
>  
>  static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 063d70e736f7f..e4b07be450d4e 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1612,8 +1612,12 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
>  {
>  	u32 ecx = kvm_rcx_read(vcpu);
>  	u64 data = kvm_read_edx_eax(vcpu);
> +	int ret = kvm_set_msr(vcpu, ecx, data);
>  
> -	if (kvm_set_msr(vcpu, ecx, data)) {
> +	if (ret < 0)
> +		return ret;
> +
> +	if (ret > 0) {
>  		trace_kvm_msr_write_ex(ecx, data);
>  		kvm_inject_gp(vcpu, 0);
>  		return 1;
Qian Cai Oct. 27, 2020, 8:31 p.m. UTC | #2
On Mon, 2020-10-26 at 15:40 -0400, Qian Cai wrote:
> On Wed, 2020-09-23 at 00:10 +0300, Maxim Levitsky wrote:
> > This will allow the KVM to report such errors (e.g -ENOMEM)
> > to the userspace.
> > 
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> 
> Reverting this and its dependency:
> 
> 72f211ecaa80 KVM: x86: allow kvm_x86_ops.set_efer to return an error value
> 
> on the top of linux-next (they have also unfortunately merged into the
> mainline
> at the same time) fixed an issue that a simple Intel KVM guest is unable to
> boot
> below.

So I debug this a bit more. This also breaks nested virt (VMX). We have here:

[  345.504403] kvm [1491]: vcpu0 unhandled rdmsr: 0x4e data 0x0
[  345.758560] kvm [1491]: vcpu0 unhandled rdmsr: 0x1c9 data 0x0
[  345.758594] kvm [1491]: vcpu0 unhandled rdmsr: 0x1a6 data 0x0
[  345.758619] kvm [1491]: vcpu0 unhandled rdmsr: 0x1a7 data 0x0
[  345.758644] kvm [1491]: vcpu0 unhandled rdmsr: 0x3f6 data 0x0
[  345.951601] kvm [1493]: vcpu1 unhandled rdmsr: 0x4e data 0x0
[  351.857036] kvm [1493]: vcpu1 unhandled wrmsr: 0xc90 data 0xfffff

After this commit, -ENOENT is returned to vcpu_enter_guest() causes the
userspace to abort.

kvm_msr_ignored_check()
  kvm_set_msr()
    kvm_emulate_wrmsr()
      vmx_handle_exit()
        vcpu_enter_guest()

Something like below will unbreak the userspace, but does anyone has a better
idea?

--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1748,7 +1748,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
                return 0;
 
        /* Signal all other negative errors to userspace */
-       if (r < 0)
+       if (r < 0 && r != -ENOENT)
                return r;
 
        /* MSR write failed? Inject a #GP */
Maxim Levitsky Oct. 28, 2020, 8:51 a.m. UTC | #3
On Tue, 2020-10-27 at 16:31 -0400, Qian Cai wrote:
> On Mon, 2020-10-26 at 15:40 -0400, Qian Cai wrote:
> > On Wed, 2020-09-23 at 00:10 +0300, Maxim Levitsky wrote:
> > > This will allow the KVM to report such errors (e.g -ENOMEM)
> > > to the userspace.
> > > 
> > > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > 
> > Reverting this and its dependency:
> > 
> > 72f211ecaa80 KVM: x86: allow kvm_x86_ops.set_efer to return an error value
> > 
> > on the top of linux-next (they have also unfortunately merged into the
> > mainline
> > at the same time) fixed an issue that a simple Intel KVM guest is unable to
> > boot
> > below.
> 
> So I debug this a bit more. This also breaks nested virt (VMX). We have here:
> 
> [  345.504403] kvm [1491]: vcpu0 unhandled rdmsr: 0x4e data 0x0
> [  345.758560] kvm [1491]: vcpu0 unhandled rdmsr: 0x1c9 data 0x0
> [  345.758594] kvm [1491]: vcpu0 unhandled rdmsr: 0x1a6 data 0x0
> [  345.758619] kvm [1491]: vcpu0 unhandled rdmsr: 0x1a7 data 0x0
> [  345.758644] kvm [1491]: vcpu0 unhandled rdmsr: 0x3f6 data 0x0
> [  345.951601] kvm [1493]: vcpu1 unhandled rdmsr: 0x4e data 0x0
> [  351.857036] kvm [1493]: vcpu1 unhandled wrmsr: 0xc90 data 0xfffff
> 
> After this commit, -ENOENT is returned to vcpu_enter_guest() causes the
> userspace to abort.

Thank you very much for debugging it!

Yestarday I tried pretty much everything to reproduce it on my intel's laptop 
but I wasn't able.

I tried kvm/queue, then latest mainline, linux-next on my laptop
and all worked and even booted nested guests.

For qemu side,
I even tried RHEL's qemu, exact version as you tested.

I got really unlucky here that it seems that none of my guests ever write
an unknown msr.
Now with the information you provided, it is trivial to reproduce it 
even on my AMD machine -
All I need to do is to write an unknown msr, 
something like 'wrmsr 0x1234 0x0' using wrmsr tool.

And for the root cause of this, this is the fallout of last minute rebase of my code on top
of the userspace msr feature. I missed this -ENOENT logic that clashes with mine.

> 
> kvm_msr_ignored_check()
>   kvm_set_msr()
>     kvm_emulate_wrmsr()
>       vmx_handle_exit()
>         vcpu_enter_guest()
> 
> Something like below will unbreak the userspace, but does anyone has a better
> idea?

Checking for -ENOENT might be a right solution but I'll check now in depth,
what else interactions are affected and if this can be done closer to the
place where it happens.

Also, I am more inclined to add a new positive error code (similiar to KVM_MSR_RET_INVALID)
to indicate 'msr not found' error since this error condition is arch defined.

My reasoning is that positive error values should be used for error conditions
that cause #GP to the guest, while negative error values should only be used
for internal errors which are propogated to qemu userspace and usually kill
the guest.

I'll prepare a patch for this very soon.

Best regards,
	Maxim Levitsky 


> 
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1748,7 +1748,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
>                 return 0;
>  
>         /* Signal all other negative errors to userspace */
> -       if (r < 0)
> +       if (r < 0 && r != -ENOENT)
>                 return r;
>  
>         /* MSR write failed? Inject a #GP */
>
diff mbox series

Patch

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1d450d7710d63..d855304f5a509 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3702,13 +3702,16 @@  static int em_dr_write(struct x86_emulate_ctxt *ctxt)
 static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
 {
 	u64 msr_data;
+	int ret;
 
 	msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
 		| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
-	if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
+
+	ret = ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data);
+	if (ret > 0)
 		return emulate_gp(ctxt, 0);
 
-	return X86EMUL_CONTINUE;
+	return ret < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
 }
 
 static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 063d70e736f7f..e4b07be450d4e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1612,8 +1612,12 @@  int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 {
 	u32 ecx = kvm_rcx_read(vcpu);
 	u64 data = kvm_read_edx_eax(vcpu);
+	int ret = kvm_set_msr(vcpu, ecx, data);
 
-	if (kvm_set_msr(vcpu, ecx, data)) {
+	if (ret < 0)
+		return ret;
+
+	if (ret > 0) {
 		trace_kvm_msr_write_ex(ecx, data);
 		kvm_inject_gp(vcpu, 0);
 		return 1;